activeloopai / deeplake
Showing 21 of 30 files from the diff.

@@ -94,6 +94,7 @@
Loading
94 94
from deeplake.core.sample import Sample
95 95
from itertools import chain, repeat
96 96
from collections.abc import Iterable
97 +
from PIL import Image  # type: ignore
97 98
98 99
99 100
class ChunkEngine:
@@ -1481,7 +1482,8 @@
Loading
1481 1482
        cast: bool = True,
1482 1483
        copy: bool = False,
1483 1484
        decompress: bool = True,
1484 -
    ) -> np.ndarray:
1485 +
        to_pil: bool = False,
1486 +
    ) -> Union[np.ndarray, Image.Image]:
1485 1487
        enc = self.chunk_id_encoder
1486 1488
        if self.is_fixed_shape and self.sample_compression is None:
1487 1489
            num_samples_per_chunk = self.num_samples_per_chunk
@@ -1490,6 +1492,16 @@
Loading
1490 1492
            local_sample_index = enc.translate_index_relative_to_chunks(
1491 1493
                global_sample_index
1492 1494
            )
1495 +
        if to_pil:
1496 +
            assert isinstance(chunk, SampleCompressedChunk)
1497 +
            return chunk.read_sample(
1498 +
                local_sample_index,
1499 +
                cast=cast,
1500 +
                copy=copy,
1501 +
                decompress=decompress,
1502 +
                to_pil=True,
1503 +
            )
1504 +
1493 1505
        return chunk.read_sample(
1494 1506
            local_sample_index, cast=cast, copy=copy, decompress=decompress
1495 1507
        )

@@ -18,7 +18,8 @@
Loading
18 18
from deeplake.util.link import get_path_creds_key, save_link_creds
19 19
from deeplake.util.video import normalize_index
20 20
import numpy as np
21 -
from typing import Optional, Dict, Any, Tuple
21 +
from typing import Optional, Dict, Any, Tuple, Union
22 +
from PIL import Image  # type: ignore
22 23
23 24
24 25
def retry_refresh_managed_creds(fn):
@@ -239,7 +240,8 @@
Loading
239 240
        cast: bool = True,
240 241
        copy: bool = False,
241 242
        decompress: bool = True,
242 -
    ) -> np.ndarray:
243 +
        to_pil: bool = False,
244 +
    ) -> Union[np.ndarray, Image.Image]:
243 245
        enc = self.chunk_id_encoder
244 246
        local_sample_index = enc.translate_index_relative_to_chunks(global_sample_index)
245 247
        sample_path = chunk.read_sample(
@@ -251,7 +253,12 @@
Loading
251 253
            return self.get_empty_sample()
252 254
        sample_creds_encoded = creds_encoder.get_encoded_creds_key(global_sample_index)
253 255
        sample_creds_key = self.link_creds.get_creds_key(sample_creds_encoded)
254 -
        return read_linked_sample(sample_path, sample_creds_key, self.link_creds, False)
256 +
        read_sample = read_linked_sample(
257 +
            sample_path, sample_creds_key, self.link_creds, False
258 +
        )
259 +
        if to_pil:
260 +
            return read_sample.pil
261 +
        return read_sample.array
255 262
256 263
    def check_link_ready(self):
257 264
        missing_keys = self.link_creds.missing_keys

@@ -1,4 +1,4 @@
Loading
1 -
from typing import Iterable, Optional, Sequence, List, Union
1 +
from typing import Iterable, Optional, Sequence, List, Union, Dict
2 2
from deeplake.constants import MB
3 3
from deeplake.integrations.pytorch.common import PytorchTransformFunction
4 4
@@ -31,6 +31,7 @@
Loading
31 31
32 32
import numpy as np
33 33
import deeplake
34 +
from PIL import Image  # type: ignore
34 35
35 36
36 37
mp = torch.multiprocessing.get_context()
@@ -64,6 +65,9 @@
Loading
64 65
def copy_tensor(x):
65 66
    if isinstance(x, Sample):
66 67
        x = x.array
68 +
    if isinstance(x, Image.Image):
69 +
        return x
70 +
67 71
    try:
68 72
        copy = cast_type(x)
69 73
    except AttributeError:
@@ -402,23 +406,26 @@
Loading
402 406
    def __init__(
403 407
        self,
404 408
        dataset,
409 +
        tensors: Sequence[str],
405 410
        use_local_cache: bool = False,
406 -
        tensors: Optional[Sequence[str]] = None,
407 -
        tobytes: Union[bool, Sequence[str]] = False,
408 411
        transform: Optional[PytorchTransformFunction] = PytorchTransformFunction(),
409 412
        num_workers: int = 1,
410 413
        shuffle: bool = False,
411 414
        buffer_size: int = 0,
412 415
        return_index: bool = True,
413 416
        pad_tensors: bool = False,
417 +
        decode_method: Optional[Dict[str, str]] = None,
414 418
    ) -> None:
415 419
        super().__init__()
416 420
417 421
        self.dataset = dataset
418 422
        self.transform = transform
419 423
        self.tensors = tensors
420 -
        self.tobytes = tobytes
424 +
        self.shuffle: bool = shuffle
425 +
        self.buffer_size: int = buffer_size * MB
426 +
        self.return_index: bool = return_index
421 427
        self.pad_tensors = pad_tensors
428 +
        self.decode_method = decode_method
422 429
423 430
        self.use_local_cache = use_local_cache
424 431
        self.scheduler = use_scheduler(num_workers, shuffle)
@@ -432,19 +439,15 @@
Loading
432 439
        streaming = SampleStreaming(
433 440
            dataset,
434 441
            tensors=self.tensors,  # type: ignore
435 -
            tobytes=self.tobytes,
436 442
            use_local_cache=use_local_cache,
437 443
            pad_tensors=self.pad_tensors,
444 +
            decode_method=self.decode_method,
438 445
        )
439 446
440 447
        self.schedules: List[Schedule] = self.scheduler.schedule(
441 448
            streaming.list_blocks()
442 449
        )
443 450
444 -
        self.shuffle: bool = shuffle
445 -
        self.buffer_size: int = buffer_size * MB
446 -
        self.return_index: bool = return_index
447 -
448 451
    def __iter__(self):
449 452
        worker_info = torch.utils.data.get_worker_info()
450 453
        schedule: Schedule = self.schedules[0]
@@ -455,10 +458,10 @@
Loading
455 458
        streaming = SampleStreaming(
456 459
            self.dataset,
457 460
            tensors=self.tensors,
458 -
            tobytes=self.tobytes,
459 461
            use_local_cache=self.use_local_cache,
460 462
            return_index=self.return_index,
461 463
            pad_tensors=self.pad_tensors,
464 +
            decode_method=self.decode_method,
462 465
        )
463 466
464 467
        if self.shuffle:
@@ -477,28 +480,28 @@
Loading
477 480
    def __init__(
478 481
        self,
479 482
        dataset,
483 +
        tensors: Sequence[str],
480 484
        use_local_cache: bool = False,
481 -
        tensors: Optional[Sequence[str]] = None,
482 -
        tobytes: Union[bool, Sequence[str]] = False,
483 485
        transform: PytorchTransformFunction = PytorchTransformFunction(),
484 486
        num_workers: int = 1,
485 487
        buffer_size: int = 512,
486 488
        batch_size: int = 1,
487 489
        return_index: bool = True,
488 490
        pad_tensors: bool = False,
491 +
        decode_method: Optional[Dict[str, str]] = None,
489 492
    ) -> None:
490 493
        super().__init__()
491 494
492 495
        self.torch_datset = TorchDataset(
493 496
            dataset,
494 -
            use_local_cache,
495 -
            tensors,
496 -
            tobytes,
497 +
            tensors=tensors,
498 +
            use_local_cache=use_local_cache,
497 499
            transform=None if buffer_size else transform,
498 500
            num_workers=num_workers,
499 501
            shuffle=True,
500 502
            return_index=return_index,
501 503
            pad_tensors=pad_tensors,
504 +
            decode_method=decode_method,
502 505
        )
503 506
        if buffer_size:
504 507
            self.transform = transform

@@ -13,6 +13,7 @@
Loading
13 13
from deeplake.constants import KB
14 14
15 15
from deeplake.tests.dataset_fixtures import enabled_non_gdrive_datasets
16 +
from PIL import Image  # type: ignore
16 17
17 18
try:
18 19
    from torch.utils.data._utils.collate import default_collate
@@ -639,10 +640,14 @@
Loading
639 640
    assert s == sum(list(range(254)))
640 641
641 642
643 +
def identity(x):
644 +
    return x
645 +
646 +
642 647
@requires_torch
643 648
@enabled_non_gdrive_datasets
644 649
@pytest.mark.parametrize("compression", [None, "jpeg"])
645 -
def test_pytorch_tobytes(ds, compressed_image_paths, compression):
650 +
def test_pytorch_decode(ds, compressed_image_paths, compression):
646 651
    with ds:
647 652
        ds.create_tensor("image", sample_compression=compression)
648 653
        ds.image.extend(
@@ -654,7 +659,7 @@
Loading
654 659
            ds.pytorch()
655 660
        return
656 661
657 -
    for i, batch in enumerate(ds.pytorch(tobytes=["image"])):
662 +
    for i, batch in enumerate(ds.pytorch(decode_method={"image": "tobytes"})):
658 663
        image = batch["image"][0]
659 664
        assert isinstance(image, bytes)
660 665
        if i < 5 and not compression:
@@ -666,6 +671,19 @@
Loading
666 671
            with open(compressed_image_paths["jpeg"][0], "rb") as f:
667 672
                assert f.read() == image
668 673
674 +
    if compression:
675 +
        ptds = ds.pytorch(decode_method={"image": "pil"}, collate_fn=identity)
676 +
        for i, batch in enumerate(ptds):
677 +
            image = batch[0]["image"]
678 +
            assert isinstance(image, Image.Image)
679 +
            if i < 5:
680 +
                np.testing.assert_array_equal(
681 +
                    np.array(image), i * np.ones((10, 10, 3), dtype=np.uint8)
682 +
                )
683 +
            elif i >= 5:
684 +
                with Image.open(compressed_image_paths["jpeg"][0]) as f:
685 +
                    np.testing.assert_array_equal(np.array(f), np.array(image))
686 +
669 687
670 688
@requires_torch
671 689
def test_rename(local_ds):

@@ -25,5 +25,8 @@
Loading
25 25
    assert arr.shape == (900, 900, 3)
26 26
    assert arr.dtype == np.uint32
27 27
28 +
    pil_img = flower.pil
29 +
    assert pil_img.size == (464, 513)
30 +
28 31
29 32
# TODO: test creating Sample with np.ndarray

@@ -1,6 +1,4 @@
Loading
1 1
import pickle
2 -
from deeplake.experimental import dataloader
3 -
4 2
import deeplake
5 3
import numpy as np
6 4
import pytest
@@ -14,6 +12,7 @@
Loading
14 12
from deeplake.constants import KB
15 13
16 14
from deeplake.tests.dataset_fixtures import enabled_non_gdrive_datasets
15 +
from PIL import Image  # type: ignore
17 16
18 17
try:
19 18
    from torch.utils.data._utils.collate import default_collate
@@ -63,9 +62,9 @@
Loading
63 62
64 63
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
65 64
        with pytest.raises(ValueError):
66 -
            dl = dataloader(ds)
65 +
            dl = ds.dataloader()
67 66
        return
68 -
    dl = dataloader(ds).batch(1).pytorch(num_workers=2)
67 +
    dl = ds.dataloader().batch(1).pytorch(num_workers=2)
69 68
70 69
    assert len(dl.dataset) == 16
71 70
@@ -79,7 +78,7 @@
Loading
79 78
            )
80 79
81 80
    sub_ds = ds[5:]
82 -
    sub_dl = dataloader(sub_ds).pytorch(num_workers=0)
81 +
    sub_dl = sub_ds.dataloader().pytorch(num_workers=0)
83 82
84 83
    for i, batch in enumerate(sub_dl):
85 84
        np.testing.assert_array_equal(
@@ -90,7 +89,7 @@
Loading
90 89
        )
91 90
92 91
    sub_ds2 = ds[8:12]
93 -
    sub_dl2 = dataloader(sub_ds2).pytorch(num_workers=0)
92 +
    sub_dl2 = sub_ds2.dataloader().pytorch(num_workers=0)
94 93
95 94
    for _ in range(2):
96 95
        for i, batch in enumerate(sub_dl2):
@@ -102,7 +101,7 @@
Loading
102 101
            )
103 102
104 103
    sub_ds3 = ds[:5]
105 -
    sub_dl3 = dataloader(sub_ds3).pytorch(num_workers=0)
104 +
    sub_dl3 = sub_ds3.dataloader().pytorch(num_workers=0)
106 105
107 106
    for _ in range(2):
108 107
        for i, batch in enumerate(sub_dl3):
@@ -127,11 +126,11 @@
Loading
127 126
128 127
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
129 128
        with pytest.raises(ValueError):
130 -
            dl = dataloader(ds)
129 +
            dl = ds.dataloader()
131 130
        return
132 131
133 132
    dl = (
134 -
        dataloader(ds)
133 +
        ds.dataloader()
135 134
        .batch(1)
136 135
        .transform(to_tuple, t1="image", t2="image2")
137 136
        .pytorch(num_workers=2)
@@ -161,10 +160,10 @@
Loading
161 160
162 161
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
163 162
        with pytest.raises(ValueError):
164 -
            dl = dataloader(ds)
163 +
            dl = ds.dataloader()
165 164
        return
166 165
167 -
    dl = dataloader(ds).transform({"image": double, "image2": None}).pytorch()
166 +
    dl = ds.dataloader().transform({"image": double, "image2": None}).pytorch()
168 167
169 168
    assert len(dl.dataset) == 16
170 169
@@ -209,10 +208,10 @@
Loading
209 208
210 209
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
211 210
        with pytest.raises(ValueError):
212 -
            dl = dataloader(ds)
211 +
            dl = ds.dataloader()
213 212
        return
214 213
215 -
    dl = dataloader(ds).pytorch(num_workers=0)
214 +
    dl = ds.dataloader().pytorch(num_workers=0)
216 215
217 216
    for _ in range(2):
218 217
        for batch in dl:
@@ -234,13 +233,13 @@
Loading
234 233
235 234
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
236 235
        with pytest.raises(ValueError):
237 -
            dl = dataloader(ds)
236 +
            dl = ds.dataloader()
238 237
        return
239 238
240 239
    with pytest.raises(TensorDoesNotExistError):
241 -
        dl = dataloader(ds).pytorch(tensors=["c", "d", "e"])
240 +
        dl = ds.dataloader().pytorch(tensors=["c", "d", "e"])
242 241
243 -
    dl = dataloader(ds).pytorch(tensors=["c", "d", "a"], return_index=False)
242 +
    dl = ds.dataloader().pytorch(tensors=["c", "d", "a"], return_index=False)
244 243
245 244
    for i, batch in enumerate(dl):
246 245
        c1, d1, a1 = batch
@@ -280,7 +279,7 @@
Loading
280 279
    base_storage.enable_readonly()
281 280
    ds = Dataset(storage=local_ds.storage, read_only=True, verbose=False)
282 281
283 -
    ptds = dataloader(ds).pytorch(num_workers=2)
282 +
    ptds = ds.dataloader().pytorch(num_workers=2)
284 283
    # no need to check input, only care that readonly works
285 284
    for _ in ptds:
286 285
        pass
@@ -313,7 +312,7 @@
Loading
313 312
            local_ds.images.pngs.flowers.append(img2)
314 313
315 314
    another_ds = deeplake.dataset(local_ds.path)
316 -
    dl = dataloader(another_ds).pytorch(return_index=False)
315 +
    dl = another_ds.dataloader().pytorch(return_index=False)
317 316
    for i, (cat, flower) in enumerate(dl):
318 317
        assert cat[0].shape == another_ds.images.jpegs.cats[i].numpy().shape
319 318
        assert flower[0].shape == another_ds.images.pngs.flowers[i].numpy().shape
@@ -326,7 +325,7 @@
Loading
326 325
        local_ds.create_tensor("strings", htype="text")
327 326
        local_ds.strings.extend([f"string{idx}" for idx in range(5)])
328 327
329 -
    ptds = dataloader(local_ds).pytorch()
328 +
    ptds = local_ds.dataloader().pytorch()
330 329
    for idx, batch in enumerate(ptds):
331 330
        np.testing.assert_array_equal(batch["strings"], f"string{idx}")
332 331
@@ -365,7 +364,7 @@
Loading
365 364
        ds.img2.extend(arr_list_2)
366 365
        ds.label.extend(label_list)
367 366
368 -
    ptds = dataloader(local_ds[index]).pytorch()
367 +
    ptds = local_ds[index].dataloader().pytorch()
369 368
    idxs = list(IndexEntry(index).indices(len(local_ds)))
370 369
    for idx, batch in enumerate(ptds):
371 370
        idx = idxs[idx]
@@ -387,7 +386,7 @@
Loading
387 386
            local_ds.b.append(1)
388 387
            local_ds.c.append(2)
389 388
390 -
    ptds = dataloader(local_ds).batch(4).pytorch(collate_fn=reorder_collate)
389 +
    ptds = local_ds.dataloader().batch(4).pytorch(collate_fn=reorder_collate)
391 390
    if shuffle:
392 391
        ptds = ptds.shuffle()
393 392
    for batch in ptds:
@@ -412,7 +411,7 @@
Loading
412 411
            local_ds.c.append(2 * np.ones((300, 300)))
413 412
414 413
    ptds = (
415 -
        dataloader(local_ds)
414 +
        local_ds.dataloader()
416 415
        .batch(4)
417 416
        .pytorch(
418 417
            collate_fn=my_transform_collate,
@@ -439,7 +438,7 @@
Loading
439 438
@requires_libdeeplake
440 439
@enabled_non_gdrive_datasets
441 440
@pytest.mark.parametrize("compression", [None, "jpeg"])
442 -
def test_pytorch_tobytes(ds, compressed_image_paths, compression):
441 +
def test_pytorch_decode(ds, compressed_image_paths, compression):
443 442
    with ds:
444 443
        ds.create_tensor("image", sample_compression=compression)
445 444
        ds.image.extend(
@@ -448,10 +447,10 @@
Loading
448 447
        ds.image.extend([deeplake.read(compressed_image_paths["jpeg"][0])] * 5)
449 448
    if isinstance(get_base_storage(ds.storage), (MemoryProvider, GCSProvider)):
450 449
        with pytest.raises(ValueError):
451 -
            dl = dataloader(ds)
450 +
            dl = ds.dataloader()
452 451
        return
453 452
454 -
    ptds = dataloader(ds).pytorch(tobytes=["image"])
453 +
    ptds = ds.dataloader().pytorch(decode_method={"image": "tobytes"})
455 454
456 455
    for i, batch in enumerate(ptds):
457 456
        image = batch["image"][0]
@@ -465,6 +464,19 @@
Loading
465 464
            with open(compressed_image_paths["jpeg"][0], "rb") as f:
466 465
                assert f.read() == image
467 466
467 +
    if compression:
468 +
        ptds = ds.dataloader().numpy(decode_method={"image": "pil"})
469 +
        for i, batch in enumerate(ptds):
470 +
            image = batch[0]["image"]
471 +
            assert isinstance(image, Image.Image)
472 +
            if i < 5:
473 +
                np.testing.assert_array_equal(
474 +
                    np.array(image), i * np.ones((10, 10, 3), dtype=np.uint8)
475 +
                )
476 +
            elif i >= 5:
477 +
                with Image.open(compressed_image_paths["jpeg"][0]) as f:
478 +
                    np.testing.assert_array_equal(np.array(f), np.array(image))
479 +
468 480
469 481
@requires_torch
470 482
@requires_libdeeplake
@@ -476,7 +488,7 @@
Loading
476 488
        ds.rename_tensor("abc", "xyz")
477 489
        ds.rename_group("blue", "red")
478 490
        ds["red/green"].append([1, 2, 3, 4])
479 -
    loader = dataloader(ds).pytorch(return_index=False)
491 +
    loader = ds.dataloader().pytorch(return_index=False)
480 492
    for sample in loader:
481 493
        assert set(sample.keys()) == {"xyz", "red/green"}
482 494
        np.testing.assert_array_equal(np.array(sample["xyz"]), np.array([[1, 2, 3]]))
@@ -489,7 +501,7 @@
Loading
489 501
@requires_libdeeplake
490 502
def test_expiration_date_casting_to_string():
491 503
    ds = deeplake.dataset("hub://activeloop/cifar100-train")[0:10:2]
492 -
    loader = dataloader(ds).pytorch(return_index=False)
504 +
    loader = ds.dataloader().pytorch(return_index=False)
493 505
    for _ in loader:
494 506
        pass
495 507
@@ -505,7 +517,7 @@
Loading
505 517
            ds.xyz.append(i * np.ones((2, 2)))
506 518
507 519
    ptds = (
508 -
        dataloader(local_ds)
520 +
        local_ds.dataloader()
509 521
        .batch(4)
510 522
        .pytorch(num_workers=num_workers, return_index=True)
511 523
    )
@@ -529,7 +541,7 @@
Loading
529 541
            ds.xyz.append(i * np.ones((2, 2)))
530 542
531 543
    ptds = (
532 -
        dataloader(local_ds)
544 +
        local_ds.dataloader()
533 545
        .batch(4)
534 546
        .transform(index_transform)
535 547
        .pytorch(num_workers=num_workers, return_index=True)
@@ -557,7 +569,7 @@
Loading
557 569
            ds.xyz.append(i * np.ones((2, 2)))
558 570
559 571
    ptds = (
560 -
        dataloader(local_ds)
572 +
        local_ds.dataloader()
561 573
        .batch(4)
562 574
        .transform({"xyz": double, "index": None})
563 575
        .pytorch(num_workers=num_workers, return_index=True)
@@ -571,7 +583,7 @@
Loading
571 583
            np.testing.assert_array_equal(2 * batch["index"][i], batch["xyz"][i][0, 0])
572 584
573 585
    ptds = (
574 -
        dataloader(local_ds)
586 +
        local_ds.dataloader()
575 587
        .batch(4)
576 588
        .transform({"xyz": double})
577 589
        .pytorch(num_workers=num_workers, return_index=True)
@@ -595,7 +607,7 @@
Loading
595 607
596 608
    with pytest.raises(ValueError):
597 609
        ptds = (
598 -
            dataloader(local_ds)
610 +
            local_ds.dataloader()
599 611
            .batch(4)
600 612
            .pytorch(
601 613
                num_workers=num_workers, return_index=True, tensors=["xyz", "index"]
@@ -603,7 +615,7 @@
Loading
603 615
        )
604 616
605 617
    ptds = (
606 -
        dataloader(local_ds)
618 +
        local_ds.dataloader()
607 619
        .batch(4)
608 620
        .pytorch(num_workers=num_workers, return_index=True, tensors=["xyz"])
609 621
    )
@@ -622,7 +634,7 @@
Loading
622 634
        ds.create_tensor("y")
623 635
        ds.x.extend(list(range(5)))
624 636
        ds.y.extend(list(range(10)))
625 -
    ptds = dataloader(ds).pytorch()
637 +
    ptds = ds.dataloader().pytorch()
626 638
    for i, batch in enumerate(ptds):
627 639
        x, y = np.array(batch["x"][0]), np.array(batch["y"][0])
628 640
        np.testing.assert_equal(x, i)
@@ -637,16 +649,16 @@
Loading
637 649
        ds.create_tensor("y")
638 650
        ds.x.extend(list(range(5)))
639 651
640 -
    ptds = dataloader(ds).pytorch()
652 +
    ptds = ds.dataloader().pytorch()
641 653
    with pytest.raises(EmptyTensorError):
642 654
        for _ in ptds:
643 655
            pass
644 656
645 -
    ptds = dataloader(ds).pytorch(tensors=["x", "y"])
657 +
    ptds = ds.dataloader().pytorch(tensors=["x", "y"])
646 658
    with pytest.raises(EmptyTensorError):
647 659
        for _ in ptds:
648 660
            pass
649 661
650 -
    ptds = dataloader(ds).pytorch(tensors=["x"])
662 +
    ptds = ds.dataloader().pytorch(tensors=["x"])
651 663
    for _ in ptds:
652 664
        pass
653 665
similarity index 100%
654 666
rename from deeplake/experimental/test_query.py
655 667
rename to deeplake/enterprise/test_query.py
656 668
similarity index 93%
657 669
rename from deeplake/experimental/util.py
658 670
rename to deeplake/enterprise/util.py

@@ -1,17 +1,17 @@
Loading
1 -
from typing import Callable, Dict, List, Optional, Union, Sequence
2 -
from deeplake.experimental.convert_to_libdeeplake import dataset_to_libdeeplake  # type: ignore
3 -
from deeplake.experimental.util import (
1 +
from typing import Callable, Dict, List, Optional, Union
2 +
from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake  # type: ignore
3 +
from deeplake.enterprise.util import (
4 4
    create_fetching_schedule,
5 5
    find_primary_tensor,
6 6
    raise_indra_installation_error,
7 7
    verify_base_storage,
8 8
)
9 -
from deeplake.experimental.util import collate_fn as default_collate  # type: ignore
10 -
from deeplake.experimental.libdeeplake_query import query, sample_by
9 +
from deeplake.enterprise.libdeeplake_query import query, sample_by
11 10
from deeplake.integrations.pytorch.common import (
12 11
    PytorchTransformFunction,
13 12
    check_tensors,
14 -
    remove_intersections,
13 +
    get_collate_fn,
14 +
    validate_decode_method,
15 15
)
16 16
from deeplake.util.bugout_reporter import deeplake_reporter
17 17
from deeplake.util.dataset import map_tensor_keys
@@ -58,7 +58,7 @@
Loading
58 58
        _return_index=None,
59 59
        _primary_tensor_name=None,
60 60
        _buffer_size=None,
61 -
        _tobytes=None,
61 +
        _decode_method=None,
62 62
    ):
63 63
        import_indra_loader()
64 64
        self.dataset = dataset
@@ -76,7 +76,7 @@
Loading
76 76
        self._return_index = _return_index
77 77
        self._primary_tensor_name = _primary_tensor_name or find_primary_tensor(dataset)
78 78
        self._buffer_size = _buffer_size
79 -
        self._tobytes = _tobytes
79 +
        self._decode_method = _decode_method
80 80
81 81
    def batch(self, batch_size: int, drop_last: bool = False):
82 82
        """Returns a batched :class:`DeepLakeDataLoader` object.
@@ -176,14 +176,12 @@
Loading
176 176
177 177
        Examples:
178 178
            >>> import deeplake
179 -
            >>> from deeplake.experimental import dataloader
180 179
            >>> ds_train = deeplake.load('hub://activeloop/fashion-mnist-train')
181 -
            >>> query_ds_train = dataloader(ds_train).query("select * where labels != 5")
180 +
            >>> query_ds_train = ds_train.dataloader().query("select * where labels != 5")
182 181
183 182
            >>> import deeplake
184 -
            >>> from deeplake.experimental import query
185 183
            >>> ds_train = deeplake.load('hub://activeloop/coco-train')
186 -
            >>> query_ds_train = dataloader(ds_train).query("(select * where contains(categories, 'car') limit 1000) union (select * where contains(categories, 'motorcycle') limit 1000)")
184 +
            >>> query_ds_train = ds_train.dataloader().query("(select * where contains(categories, 'car') limit 1000) union (select * where contains(categories, 'motorcycle') limit 1000)")
187 185
        """
188 186
        all_vars = self.__dict__.copy()
189 187
        all_vars["dataset"] = query(self.dataset, query_string)
@@ -212,20 +210,21 @@
Loading
212 210
            Sample the dataloader with ``labels == 5`` twice more than ``labels == 6``
213 211
214 212
            >>> ds = deeplake.load('hub://activeloop/fashion-mnist-train')
215 -
            >>> sampled_ds = dataloader(ds).sample_by("max_weight(labels == 5: 10, labels == 6: 5)")
213 +
            >>> sampled_ds = ds.dataloader().sample_by("max_weight(labels == 5: 10, labels == 6: 5)")
216 214
217 215
            Sample the dataloader treating `labels` tensor as weights.
218 216
219 217
            >>> ds = deeplake.load('hub://activeloop/fashion-mnist-train')
220 -
            >>> sampled_ds = dataloader(ds).sample_by("labels")
218 +
            >>> sampled_ds = ds.dataloader().sample_by("labels")
221 219
222 220
            Sample the dataloader with the given weights;
223 221
224 222
            >>> ds_train = deeplake.load('hub://activeloop/coco-train')
225 223
            >>> weights = list()
226 224
            >>> for i in range(0, len(ds_train)):
227 -
            >>>     weights.append(i % 5)
228 -
            >>> sampled_ds = dataloader(ds).sample_by(weights, replace=False)
225 +
            ...     weights.append(i % 5)
226 +
            ...
227 +
            >>> sampled_ds = ds_train.dataloader().sample_by(weights, replace=False)
229 228
230 229
        """
231 230
        all_vars = self.__dict__.copy()
@@ -244,7 +243,7 @@
Loading
244 243
        prefetch_factor: int = 2,
245 244
        distributed: bool = False,
246 245
        return_index: bool = True,
247 -
        tobytes: Union[bool, Sequence[str]] = False,
246 +
        decode_method: Optional[Dict[str, str]] = None,
248 247
    ):
249 248
        """Returns a :class:`DeepLakeDataLoader` object.
250 249
@@ -252,27 +251,35 @@
Loading
252 251
        Args:
253 252
            num_workers (int): Number of workers to use for transforming and processing the data. Defaults to 0.
254 253
            collate_fn (Callable, Optional): merges a list of samples to form a mini-batch of Tensor(s).
255 -
            tensors (List[str], Optional): List of tensors to load. If None, all tensors are loaded. Defaults to None.
256 -
            num_threads (int, Optional): Number of threads to use for fetching and decompressing the data. If None, the number of threads is automatically determined. Defaults to None.
254 +
            tensors (List[str], Optional): List of tensors to load. If None, all tensors are loaded. Defaults to ``None``.
255 +
            num_threads (int, Optional): Number of threads to use for fetching and decompressing the data. If ``None``, the number of threads is automatically determined. Defaults to ``None``.
257 256
            prefetch_factor (int): Number of batches to transform and collate in advance per worker. Defaults to 2.
258 -
            distributed (bool): Used for DDP training. Distributes different sections of the dataset to different ranks. Defaults to False.
259 -
            return_index (bool): Used to identify where loader needs to return sample index or not. Defaults to True.
260 -
            tobytes (bool, Sequence[str]): If ``True``, samples will not be decompressed and their raw bytes will be returned instead of numpy arrays. Can also be a list of tensors, in which case those tensors alone will not be decompressed.
257 +
            distributed (bool): Used for DDP training. Distributes different sections of the dataset to different ranks. Defaults to ``False``.
258 +
            return_index (bool): Used to identify whether the loader needs to return the sample index or not. Defaults to ``True``.
259 +
            decode_method (Dict[str, str], Optional): A dictionary of decode methods for each tensor. Defaults to ``None``.
260 +
261 +
                - Supported decode methods are:
262 +
263 +
                    :'numpy': Default behaviour. Returns samples as numpy arrays.
264 +
                    :'tobytes': Returns raw bytes of the samples.
265 +
                    :'pil': Returns samples as PIL images. Especially useful when the transform uses torchvision transforms that
266 +
                            require PIL images as input. Only supported for tensors with sample_compression='jpeg' or 'png'.
261 267
262 268
        Returns:
263 269
            DeepLakeDataLoader: A :class:`DeepLakeDataLoader` object.
264 270
265 271
        Raises:
266 -
            ValueError: If .to_pytorch() or .to_numpy() has already been called.
272 +
            ValueError: If .pytorch() or .numpy() has already been called.
267 273
        """
268 274
        if self._mode is not None:
269 275
            if self._mode == "numpy":
270 -
                raise ValueError("Can't call .to_pytorch after .to_numpy()")
271 -
            raise ValueError("already called .to_pytorch()")
276 +
                raise ValueError("Can't call .pytorch after .numpy()")
277 +
            raise ValueError("already called .pytorch()")
272 278
        all_vars = self.__dict__.copy()
273 279
        all_vars["_num_workers"] = num_workers
274 280
        all_vars["_collate"] = collate_fn
275 -
        handle_tensors_and_tobytes(tensors, tobytes, self.dataset, all_vars)
281 +
        validate_tensors(tensors, self.dataset, all_vars)
282 +
        all_vars["_decode_method"] = decode_method
276 283
        all_vars["_num_threads"] = num_threads
277 284
        all_vars["_prefetch_factor"] = prefetch_factor
278 285
        all_vars["_distributed"] = distributed
@@ -287,7 +294,7 @@
Loading
287 294
        tensors: Optional[List[str]] = None,
288 295
        num_threads: Optional[int] = None,
289 296
        prefetch_factor: int = 2,
290 -
        tobytes: Union[bool, Sequence[str]] = False,
297 +
        decode_method: Optional[Dict[str, str]] = None,
291 298
    ):
292 299
        """Returns a :class:`DeepLakeDataLoader` object.
293 300
@@ -296,21 +303,28 @@
Loading
296 303
            tensors (List[str], Optional): List of tensors to load. If None, all tensors are loaded. Defaults to None.
297 304
            num_threads (int, Optional): Number of threads to use for fetching and decompressing the data. If None, the number of threads is automatically determined. Defaults to None.
298 305
            prefetch_factor (int): Number of batches to transform and collate in advance per worker. Defaults to 2.
299 -
            tobytes (bool, Sequence[str]): If ``True``, samples will not be decompressed and their raw bytes will be returned instead of numpy arrays. Can also be a list of tensors, in which case those tensors alone will not be decompressed.
306 +
            decode_method (Dict[str, str], Optional): A dictionary of decode methods for each tensor. Defaults to None.
307 +
308 +
                - Supported decode methods are:
309 +
310 +
                    :'numpy': Default behaviour. Returns samples as numpy arrays.
311 +
                    :'tobytes': Returns raw bytes of the samples.
312 +
                    :'pil': Returns samples as PIL images. Especially useful when the transformation uses torchvision transforms, which require PIL images as input. Only supported for tensors with sample_compression='jpeg' or 'png'.
300 313
301 314
        Returns:
302 315
            DeepLakeDataLoader: A :class:`DeepLakeDataLoader` object.
303 316
304 317
        Raises:
305 -
            ValueError: If .to_pytorch() or .to_numpy() has already been called.
318 +
            ValueError: If .pytorch() or .numpy() has already been called.
306 319
        """
307 320
        if self._mode is not None:
308 321
            if self._mode == "pytorch":
309 -
                raise ValueError("Can't call .to_numpy after .to_pytorch()")
310 -
            raise ValueError("already called .to_numpy()")
322 +
                raise ValueError("Can't call .numpy after .pytorch()")
323 +
            raise ValueError("already called .numpy()")
311 324
        all_vars = self.__dict__.copy()
312 325
        all_vars["_num_workers"] = num_workers
313 -
        handle_tensors_and_tobytes(tensors, tobytes, self.dataset, all_vars)
326 +
        validate_tensors(tensors, self.dataset, all_vars)
327 +
        all_vars["_decode_method"] = decode_method
314 328
        all_vars["_tensors"] = self._tensors or tensors
315 329
        all_vars["_num_threads"] = num_threads
316 330
        all_vars["_prefetch_factor"] = prefetch_factor
@@ -318,62 +332,35 @@
Loading
318 332
        return self.__class__(**all_vars)
319 333
320 334
    def __iter__(self):
321 -
        tensors = self._tensors or map_tensor_keys(self.dataset, None)
322 -
323 -
        # uncompressed tensors will be uncompressed in the workers on python side
324 -
        compressed_tensors = check_tensors(self.dataset, tensors)
325 -
        dataset = dataset_to_libdeeplake(self.dataset)
326 -
        batch_size = self._batch_size or 1
327 -
        drop_last = self._drop_last or False
328 -
        return_index = self._return_index
329 -
        if return_index is None:
330 -
            return_index = True
331 -
332 -
        shuffle = self._shuffle
333 -
        if shuffle is None:
334 -
            shuffle = False
335 -
336 -
        transform_fn = self._transform
337 -
338 -
        num_workers = self._num_workers or 0
339 -
        if self._collate is None and self._mode == "pytorch":
340 -
            collate_fn = default_collate
341 -
        else:
342 -
            collate_fn = self._collate
343 -
        num_threads = self._num_threads
344 -
        prefetch_factor = self._prefetch_factor
345 -
        distributed = self._distributed or False
346 -
347 -
        # only upcast for pytorch, this handles unsupported dtypes
348 -
        upcast = self._mode == "pytorch"
335 +
        collate_fn = get_collate_fn(self._collate, self._mode)
336 +
        upcast = self._mode == "pytorch"  # upcast to handle unsupported dtypes
349 337
350 338
        primary_tensor_name = self._primary_tensor_name
351 339
        buffer_size = self._buffer_size
352 -
        if self._tobytes is True:
353 -
            raw_tensors = tensors
354 -
        elif self._tobytes is False:
355 -
            raw_tensors = []
356 -
        else:
357 -
            raw_tensors = self._tobytes
358 340
359 -
        compressed_tensors, raw_tensors = remove_intersections(
360 -
            compressed_tensors, raw_tensors
341 +
        tensors = self._tensors or map_tensor_keys(self.dataset, None)
342 +
        dataset = dataset_to_libdeeplake(self.dataset)
343 +
344 +
        jpeg_png_compressed_tensors = check_tensors(self.dataset, tensors)
345 +
        raw_tensors, compressed_tensors = validate_decode_method(
346 +
            self._decode_method, tensors, jpeg_png_compressed_tensors
361 347
        )
348 +
        raw_tensors.extend(compressed_tensors)
362 349
        return iter(
363 350
            INDRA_LOADER(
364 351
                dataset,
365 -
                batch_size=batch_size,
366 -
                num_threads=num_threads,
367 -
                shuffle=shuffle,
368 -
                num_workers=num_workers,
352 +
                batch_size=self._batch_size,
353 +
                num_threads=self._num_threads,
354 +
                shuffle=self._shuffle,
355 +
                num_workers=self._num_workers,
369 356
                collate_fn=collate_fn,
370 -
                transform_fn=transform_fn,
371 -
                distributed=distributed,
372 -
                prefetch_factor=prefetch_factor,
357 +
                transform_fn=self._transform,
358 +
                distributed=self._distributed,
359 +
                prefetch_factor=self._prefetch_factor,
373 360
                tensors=tensors,
374 -
                drop_last=drop_last,
361 +
                drop_last=self._drop_last,
375 362
                upcast=upcast,
376 -
                return_index=return_index,
363 +
                return_index=self._return_index,
377 364
                primary_tensor=primary_tensor_name,
378 365
                buffer_size=buffer_size,
379 366
                raw_tensors=raw_tensors,
@@ -383,14 +370,14 @@
Loading
383 370
384 371
385 372
def dataloader(dataset) -> DeepLakeDataLoader:
386 -
    """Returns a :class:`~deeplake.experimental.dataloader.DeepLakeDataLoader` object which can be transformed to either pytorch dataloader or numpy.
373 +
    """Returns a :class:`~deeplake.enterprise.dataloader.DeepLakeDataLoader` object which can be transformed to either pytorch dataloader or numpy.
387 374
388 375
389 376
    Args:
390 377
        dataset: :class:`~deeplake.core.dataset.Dataset` object on which dataloader needs to be built
391 378
392 379
    Returns:
393 -
        DeepLakeDataLoader: A :class:`~deeplake.experimental.dataloader.DeepLakeDataLoader` object.
380 +
        DeepLakeDataLoader: A :class:`~deeplake.enterprise.dataloader.DeepLakeDataLoader` object.
394 381
395 382
396 383
    Examples:
@@ -400,7 +387,7 @@
Loading
400 387
401 388
402 389
        >>> import deeplake
403 -
        >>> from deeplake.experimental import dataloader
390 +
        >>> from deeplake.enterprise import dataloader
404 391
        >>>
405 392
        >>> ds_train = deeplake.load('hub://activeloop/fashion-mnist-train')
406 393
        >>> train_loader = dataloader(ds_train).numpy()
@@ -452,7 +439,7 @@
Loading
452 439
    return DeepLakeDataLoader(dataset)
453 440
454 441
455 -
def handle_tensors_and_tobytes(tensors, tobytes, dataset, all_vars):
442 +
def validate_tensors(tensors, dataset, all_vars):
456 443
    existing_tensors = all_vars["_tensors"]
457 444
    if tensors:
458 445
        if "index" in tensors:
@@ -465,11 +452,3 @@
Loading
465 452
                "Tensors have already been specified by passing a dictionary to .transform() method"
466 453
            )
467 454
    all_vars["_tensors"] = existing_tensors or tensors
468 -
    if isinstance(tobytes, Sequence):
469 -
        tobytes = map_tensor_keys(dataset, tobytes)
470 -
        if tobytes and all_vars["_tensors"]:
471 -
            tensor_set = set(all_vars["_tensors"])
472 -
            for tensor in tobytes:
473 -
                if tensor not in tensor_set:
474 -
                    raise ValueError(f"tobytes tensor {tensor} not found in tensors.")
475 -
    all_vars["_tobytes"] = tobytes
476 455
similarity index 96%
477 456
rename from deeplake/experimental/libdeeplake_query.py
478 457
rename to deeplake/enterprise/libdeeplake_query.py

@@ -1344,7 +1344,7 @@
Loading
1344 1344
            The dictionary will always have 2 keys, "dataset" and "tensors". The values corresponding to these keys are detailed below:
1345 1345
1346 1346
                - If ``id_1`` and ``id_2`` are None, both the keys will have a single list as their value. This list will contain a dictionary describing changes compared to the previous commit.
1347 -
                - If only ``id_1`` is provided, both keys will have a tuple of 2 lists as their value. The lists will contain dictionaries describing commitwise differences between commits. The 2 lists will range from current state and ``id_1` to most recent common ancestor the commits respectively.
1347 +
                - If only ``id_1`` is provided, both keys will have a tuple of 2 lists as their value. The lists will contain dictionaries describing commitwise differences between commits. The 2 lists will range from the current state and ``id_1``, respectively, to the most recent common ancestor of the commits.
1348 1348
                - If only ``id_2`` is provided, a ValueError will be raised.
1349 1349
                - If both ``id_1`` and ``id_2`` are provided, both keys will have a tuple of 2 lists as their value. The lists will contain dictionaries describing commitwise differences between commits. The 2 lists will range from ``id_1`` and ``id_2``, respectively, to the most recent common ancestor of the commits.
1350 1350
@@ -1464,7 +1464,6 @@
Loading
1464 1464
        self,
1465 1465
        transform: Optional[Callable] = None,
1466 1466
        tensors: Optional[Sequence[str]] = None,
1467 -
        tobytes: Union[bool, Sequence[str]] = False,
1468 1467
        num_workers: int = 1,
1469 1468
        batch_size: int = 1,
1470 1469
        drop_last: bool = False,
@@ -1477,29 +1476,37 @@
Loading
1477 1476
        return_index: bool = True,
1478 1477
        pad_tensors: bool = False,
1479 1478
        transform_kwargs: Optional[Dict[str, Any]] = None,
1479 +
        decode_method: Optional[Dict[str, str]] = None,
1480 1480
    ):
1481 1481
        """Converts the dataset into a pytorch Dataloader.
1482 1482
1483 1483
        Args:
1484 1484
            transform (Callable, Optional): Transformation function to be applied to each sample.
1485 -
            tensors (List, Optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label).
1486 -
            tobytes (bool): If ``True``, samples will not be decompressed and their raw bytes will be returned instead of numpy arrays. Can also be a list of tensors, in which case those tensors alone will not be decompressed.
1485 +
            tensors (List, Optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if ``tensors=["image", "label"]``, your training script should expect each batch will be provided as a tuple of (image, label).
1487 1486
            num_workers (int): The number of workers to use for fetching data in parallel.
1488 1487
            batch_size (int): Number of samples per batch to load. Default value is 1.
1489 1488
            drop_last (bool): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size.
1490 -
                if ``False`` and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Default value is False.
1489 +
                if ``False`` and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Default value is ``False``.
1491 1490
                Read torch.utils.data.DataLoader docs for more details.
1492 1491
            collate_fn (Callable, Optional): merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
1493 1492
                Read torch.utils.data.DataLoader docs for more details.
1494 -
            pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is False.
1493 +
            pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is ``False``.
1495 1494
                Read torch.utils.data.DataLoader docs for more details.
1496 -
            shuffle (bool): If ``True``, the data loader will shuffle the data indices. Default value is False. Details about how Deep Lake shuffles data can be found at https://docs.activeloop.ai/how-hub-works/shuffling-in-ds.pytorch
1495 +
            shuffle (bool): If ``True``, the data loader will shuffle the data indices. Default value is False. Details about how Deep Lake shuffles data can be found at `Shuffling in ds.pytorch() <https://docs.activeloop.ai/how-it-works/shuffling-in-ds.pytorch>`_
1497 1496
            buffer_size (int): The size of the buffer used to shuffle the data in MBs. Defaults to 2048 MB. Increasing the buffer_size will increase the extent of shuffling.
1498 -
            use_local_cache (bool): If ``True``, the data loader will use a local cache to store data. The default cache location is ~/.activeloop/cache, but it can be changed by setting the LOCAL_CACHE_PREFIX environment variable. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is False
1497 +
            use_local_cache (bool): If ``True``, the data loader will use a local cache to store data. The default cache location is ~/.activeloop/cache, but it can be changed by setting the ``LOCAL_CACHE_PREFIX`` environment variable. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is ``False``
1499 1498
            use_progress_bar (bool): If ``True``, tqdm will be wrapped around the returned dataloader. Default value is True.
1500 1499
            return_index (bool): If ``True``, the returned dataloader will have a key "index" that contains the index of the sample(s) in the original dataset. Default value is True.
1501 1500
            pad_tensors (bool): If ``True``, shorter tensors will be padded to the length of the longest tensor. Default value is False.
1502 -
            transform_kwargs (optional, Dict[str, Any]): Additional kwargs to be passed to `transform`.
1501 +
            transform_kwargs (optional, Dict[str, Any]): Additional kwargs to be passed to ``transform``.
1502 +
            decode_method (Dict[str, str], Optional): A dictionary of decode methods for each tensor. Defaults to ``None``.
1503 +
1504 +
                - Supported decode methods are:
1505 +
1506 +
                    :'numpy': Default behaviour. Returns samples as numpy arrays.
1507 +
                    :'tobytes': Returns raw bytes of the samples.
1508 +
                    :'pil': Returns samples as PIL images. Especially useful when the transformation uses torchvision transforms, which
1509 +
                            require PIL images as input. Only supported for tensors with ``sample_compression='jpeg'`` or ``'png'``.
1503 1510
1504 1511
        Returns:
1505 1512
            A torch.utils.data.DataLoader object.
@@ -1520,7 +1527,6 @@
Loading
1520 1527
            self,
1521 1528
            transform=transform,
1522 1529
            tensors=tensors,
1523 -
            tobytes=tobytes,
1524 1530
            num_workers=num_workers,
1525 1531
            batch_size=batch_size,
1526 1532
            drop_last=drop_last,
@@ -1531,6 +1537,7 @@
Loading
1531 1537
            use_local_cache=use_local_cache,
1532 1538
            return_index=return_index,
1533 1539
            pad_tensors=pad_tensors,
1540 +
            decode_method=decode_method,
1534 1541
        )
1535 1542
1536 1543
        if use_progress_bar:
@@ -1538,6 +1545,74 @@
Loading
1538 1545
        dataset_read(self)
1539 1546
        return dataloader
1540 1547
1548 +
    def dataloader(self):
1549 +
        """Returns a :class:`~deeplake.enterprise.DeepLakeDataLoader` object. To use this, install deeplake with ``pip install deeplake[enterprise]``.
1550 +
1551 +
        Returns:
1552 +
            ~deeplake.enterprise.DeepLakeDataLoader: A :class:`deeplake.enterprise.DeepLakeDataLoader` object.
1553 +
        
1554 +
        Examples:
1555 +
1556 +
            Creating a simple dataloader object which returns a batch of numpy arrays
1557 +
1558 +
            >>> import deeplake
1559 +
            >>> ds_train = deeplake.load('hub://activeloop/fashion-mnist-train')
1560 +
            >>> train_loader = ds_train.dataloader().numpy()
1561 +
            >>> for i, data in enumerate(train_loader):
1562 +
            ...     # custom logic on data
1563 +
            ...     pass
1564 +
1565 +
1566 +
            Creating dataloader with custom transformation and batch size
1567 +
1568 +
            >>> import deeplake
1569 +
            >>> import torch
1570 +
            >>> from torchvision import datasets, transforms, models
1571 +
            >>> 
1572 +
            >>> ds_train = deeplake.load('hub://activeloop/fashion-mnist-train')
1573 +
            >>> tform = transforms.Compose([
1574 +
            ...     transforms.ToPILImage(), # Must convert to PIL image for subsequent operations to run
1575 +
            ...     transforms.RandomRotation(20), # Image augmentation
1576 +
            ...     transforms.ToTensor(), # Must convert to pytorch tensor for subsequent operations to run
1577 +
            ...     transforms.Normalize([0.5], [0.5]),
1578 +
            ... ])
1579 +
            ...
1580 +
            >>> batch_size = 32
1581 +
            >>> # create dataloader by chaining with transform function and batch size and returns batch of pytorch tensors
1582 +
            >>> train_loader = ds_train.dataloader()\\
1583 +
            ...     .transform({'images': tform, 'labels': None})\\
1584 +
            ...     .batch(batch_size)\\
1585 +
            ...     .shuffle()\\
1586 +
            ...     .pytorch()
1587 +
            ...
1588 +
            >>> # loop over the elements
1589 +
            >>> for i, data in enumerate(train_loader):
1590 +
            ...     # custom logic on data
1591 +
            ...     pass
1592 +
1593 +
            Creating dataloader and chaining with query
1594 +
1595 +
            >>> ds_train = deeplake.load('hub://activeloop/coco-train')
1596 +
            >>> train_loader = ds_train.dataloader()\\
1597 +
            ...     .query("(select * where contains(categories, 'car') limit 1000) union (select * where contains(categories, 'motorcycle') limit 1000)")\\
1598 +
            ...     .pytorch()
1599 +
            ...
1600 +
            >>> # loop over the elements
1601 +
            >>> for i, data in enumerate(train_loader):
1602 +
            ...     # custom logic on data
1603 +
            ...     pass
1604 +
1605 +
        **Restrictions**
1606 +
1607 +
        The new high performance C++ dataloader is part of our Growth and Enterprise Plans.
1608 +
1609 +
        - Users of our Community plan can create dataloaders on Activeloop datasets ("hub://activeloop/..." datasets).
1610 +
        - To create dataloaders on your own datasets, `upgrade your organization's plan <https://www.activeloop.ai/pricing/>`_.
1611 +
        """
1612 +
        from deeplake.enterprise import dataloader
1613 +
1614 +
        return dataloader(self)
1615 +
1541 1616
    @deeplake_reporter.record_call
1542 1617
    def filter(
1543 1618
        self,
@@ -1591,7 +1666,7 @@
Loading
1591 1666
        return ret
1592 1667
1593 1668
    def query(self, query_string: str):
1594 -
        """Returns a sliced :class:`~deeplake.core.dataset.Dataset` with given query results.
1669 +
        """Returns a sliced :class:`~deeplake.core.dataset.Dataset` with given query results. To use this, install deeplake with ``pip install deeplake[enterprise]``.
1595 1670
1596 1671
        It allows to run SQL like queries on dataset and extract results. See supported keywords and the Tensor Query Language documentation
1597 1672
        :ref:`here <tql>`.
@@ -1616,8 +1691,15 @@
Loading
1616 1691
1617 1692
            >>> ds_train = deeplake.load('hub://activeloop/coco-train')
1618 1693
            >>> query_ds_train = ds_train.query("(select * where contains(categories, 'car') limit 1000) union (select * where contains(categories, 'motorcycle') limit 1000)")
1694 +
1695 +
        **Restrictions**
1696 +
1697 +
        Querying datasets is part of our Growth and Enterprise Plans.
1698 +
1699 +
        - Users of our Community plan can only perform queries on Activeloop datasets ("hub://activeloop/..." datasets).
1700 +
        - To run queries on your own datasets, `upgrade your organization's plan <https://www.activeloop.ai/pricing/>`_.
1619 1701
        """
1620 -
        from deeplake.experimental import query
1702 +
        from deeplake.enterprise import query
1621 1703
1622 1704
        return query(self, query_string)
1623 1705
@@ -1627,14 +1709,13 @@
Loading
1627 1709
        replace: Optional[bool] = True,
1628 1710
        size: Optional[int] = None,
1629 1711
    ):
1630 -
        """Returns a sliced :class:`~deeplake.core.dataset.Dataset` with given weighted sampler applied
1712 +
        """Returns a sliced :class:`~deeplake.core.dataset.Dataset` with given weighted sampler applied.
1713 +
        To use this, install deeplake with ``pip install deeplake[enterprise]``.
1631 1714
1632 1715
        Args:
1633 -
            weights: (Union[str, list, tuple]): If it's string then tql will be run to calculate the weights based on the expression. list and tuple will be treated as the list of the weights per sample
1634 -
            replace: Optional[bool] If true the samples can be repeated in the result view.
1635 -
                (default: ``True``).
1636 -
            size: Optional[int] The length of the result view.
1637 -
                (default: ``len(dataset)``)
1716 +
            weights: (Union[str, list, tuple]): If it's string then tql will be run to calculate the weights based on the expression. list and tuple will be treated as the list of the weights per sample.
1717 +
            replace: Optional[bool] If true the samples can be repeated in the result view. Defaults to ``True``
1718 +
            size: Optional[int] The length of the result view. Defaults to length of the dataset.
1638 1719
1639 1720
1640 1721
        Returns:
@@ -1651,19 +1732,26 @@
Loading
1651 1732
            Sample the dataset treating `labels` tensor as weights.
1652 1733
1653 1734
            >>> import deeplake
1654 -
            >>> from deeplake.experimental import query
1655 1735
            >>> ds = deeplake.load('hub://activeloop/fashion-mnist-train')
1656 -
            >>> sampled_ds = ds.sample_by("labels")
1736 +
            >>> sampled_ds = ds.sample_by("max_weight(labels == 5: 10, labels == 6: 5)")
1657 1737
1658 1738
            Sample the dataset with the given weights:
1659 1739
1660 1740
            >>> ds = deeplake.load('hub://activeloop/coco-train')
1661 1741
            >>> weights = list()
1662 -
            >>> for i in range(0, len(ds)):
1663 -
            >>>     weights.append(i % 5)
1742 +
            >>> for i in range(len(ds)):
1743 +
            ...     weights.append(i % 5)
1744 +
            ...
1664 1745
            >>> sampled_ds = ds.sample_by(weights, replace=False)
1746 +
1747 +
        **Restrictions**
1748 +
1749 +
        Sampling datasets is part of our Growth and Enterprise Plans.
1750 +
1751 +
        - Users of our Community plan can only use ``sample_by`` on Activeloop datasets ("hub://activeloop/..." datasets).
1752 +
        - To use sampling functionality on your own datasets, `upgrade your organization's plan <https://www.activeloop.ai/pricing/>`_.
1665 1753
        """
1666 -
        from deeplake.experimental import sample_by
1754 +
        from deeplake.enterprise import sample_by
1667 1755
1668 1756
        return sample_by(self, weights, replace, size)
1669 1757

@@ -24,6 +24,7 @@
Loading
24 24
    LocalProvider,
25 25
)
26 26
from deeplake.core.tiling.deserialize import combine_chunks
27 +
from deeplake.integrations.pytorch.common import check_tensors, validate_decode_method
27 28
from deeplake.util.exceptions import (
28 29
    DatasetUnsupportedPytorch,
29 30
    SampleDecompressionError,
@@ -31,6 +32,7 @@
Loading
31 32
from deeplake.util.keys import get_chunk_key, get_tensor_meta_key
32 33
from deeplake.util.remove_cache import get_base_storage
33 34
from deeplake.util.storage import get_pytorch_local_storage
35 +
from PIL import Image  # type: ignore
34 36
35 37
36 38
ChunkEngineMap = Dict[str, ChunkEngine]
@@ -261,10 +263,11 @@
Loading
261 263
        self,
262 264
        dataset,
263 265
        tensors: Sequence[str],
264 -
        tobytes: Union[bool, Sequence[str]] = False,
265 266
        use_local_cache: bool = False,
266 267
        return_index: bool = True,
267 268
        pad_tensors: bool = False,
269 +
        decode_method: Optional[Dict[str, str]] = None,
270 +
        tobytes: Union[bool, Sequence[str]] = False,
268 271
    ) -> None:
269 272
        super().__init__()
270 273
@@ -282,15 +285,15 @@
Loading
282 285
283 286
        self.tensors = tensors
284 287
        self.pad_tensors = pad_tensors
285 -
        if isinstance(tobytes, bool):
286 -
            self.tobytes = {k: tobytes for k in self.tensors}
287 -
        else:
288 -
            for k in tobytes:
289 -
                if k not in tensors:
290 -
                    raise Exception(
291 -
                        f"Tensor {k} is not present in the list of provided tensors: {tensors}."
292 -
                    )
293 -
            self.tobytes = {k: k in tobytes for k in tensors}
288 +
        self.decode_method = decode_method
289 +
        self.return_index = return_index
290 +
291 +
        jpeg_png_compressed_tensors = check_tensors(self.dataset, tensors)
292 +
        raw_tensors, compressed_tensors = validate_decode_method(
293 +
            self.decode_method, tensors, jpeg_png_compressed_tensors
294 +
        )
295 +
        self.raw_tensors = set(raw_tensors)
296 +
        self.compressed_tensors = set(compressed_tensors)
294 297
295 298
        self.chunk_engines: ChunkEngineMap = self._map_chunk_engines(self.tensors)
296 299
@@ -300,8 +303,6 @@
Loading
300 303
            else None
301 304
        )
302 305
303 -
        self.return_index = return_index
304 -
305 306
    def read(self, schedule: Schedule) -> Iterator:
306 307
        for block in schedule._blocks:
307 308
            yield from self.stream(block)
@@ -313,7 +314,8 @@
Loading
313 314
            valid_sample_flag = True
314 315
315 316
            for keyid, (key, engine) in enumerate(self.chunk_engines.items()):
316 -
                decompress = not self.tobytes[key]
317 +
                decompress = key not in self.raw_tensors
318 +
                to_pil = key in self.compressed_tensors
317 319
                chunk_class = engine.chunk_class
318 320
                try:
319 321
@@ -347,7 +349,7 @@
Loading
347 349
                        chunks.append(chunk)
348 350
                    if len(chunks) == 1:
349 351
                        data = engine.read_sample_from_chunk(
350 -
                            idx, chunk, decompress=decompress
352 +
                            idx, chunk, decompress=decompress, to_pil=to_pil
351 353
                        )
352 354
                    else:
353 355
                        if not decompress:
@@ -355,6 +357,8 @@
Loading
355 357
                                "`tobytes=True` is not supported by tiled samples as it can cause recompression."
356 358
                            )
357 359
                        data = combine_chunks(chunks, idx, engine.tile_encoder)
360 +
                        if to_pil:
361 +
                            data = Image.fromarray(data)  # type: ignore
358 362
359 363
                    if data is not None:
360 364
                        sample[key] = data

@@ -1, +1,6 @@
Loading
1 -
from deeplake.core.transform.transform import compute, compose
1 +
from deeplake.core.transform.transform import (
2 +
    compute,
3 +
    compose,
4 +
    ComputeFunction,
5 +
    Pipeline,
6 +
)

@@ -12,7 +12,6 @@
Loading
12 12
def create_dataloader_nesteddataloader(
13 13
    dataset,
14 14
    tensors,
15 -
    tobytes,
16 15
    use_local_cache,
17 16
    transform,
18 17
    num_workers,
@@ -23,6 +22,7 @@
Loading
23 22
    drop_last,
24 23
    return_index,
25 24
    pad_tensors,
25 +
    decode_method,
26 26
):
27 27
    import torch
28 28
    import torch.utils.data
@@ -34,7 +34,6 @@
Loading
34 34
        SubIterableDataset(
35 35
            dataset,
36 36
            tensors=tensors,
37 -
            tobytes=tobytes,
38 37
            use_local_cache=use_local_cache,
39 38
            transform=transform,
40 39
            batch_size=batch_size,
@@ -42,6 +41,7 @@
Loading
42 41
            buffer_size=buffer_size,
43 42
            return_index=return_index,
44 43
            pad_tensors=pad_tensors,
44 +
            decode_method=decode_method,
45 45
        ),
46 46
        batch_size=batch_size,
47 47
        collate_fn=collate_fn,
@@ -101,9 +101,9 @@
Loading
101 101
    use_local_cache: bool,
102 102
    transform: Optional[Union[Dict, Callable]] = None,
103 103
    tensors: Optional[Sequence[str]] = None,
104 -
    tobytes: Union[bool, Sequence[str]] = False,
105 104
    return_index: bool = True,
106 105
    pad_tensors: bool = True,
106 +
    decode_method: Optional[Dict[str, str]] = None,
107 107
):
108 108
109 109
    import torch
@@ -126,13 +126,10 @@
Loading
126 126
    else:
127 127
        transform = PytorchTransformFunction(composite_transform=transform)
128 128
129 -
    check_tensors(dataset, tensors)
130 -
131 129
    if shuffle and num_workers > 0:
132 130
        return create_dataloader(
133 131
            dataset,
134 132
            tensors,
135 -
            tobytes,
136 133
            use_local_cache,
137 134
            transform,
138 135
            num_workers,
@@ -143,13 +140,13 @@
Loading
143 140
            drop_last,
144 141
            return_index,
145 142
            pad_tensors,
143 +
            decode_method,
146 144
        )
147 145
    else:
148 146
        return torch.utils.data.DataLoader(
149 147
            TorchDataset(
150 148
                dataset,
151 149
                tensors=tensors,
152 -
                tobytes=tobytes,
153 150
                use_local_cache=use_local_cache,
154 151
                transform=transform,
155 152
                num_workers=num_workers,
@@ -157,6 +154,7 @@
Loading
157 154
                buffer_size=buffer_size,
158 155
                return_index=return_index,
159 156
                pad_tensors=pad_tensors,
157 +
                decode_method=decode_method,
160 158
            ),
161 159
            batch_size=batch_size,
162 160
            collate_fn=collate_fn,

@@ -1,4 +1,4 @@
Loading
1 -
from deeplake.experimental.util import raise_indra_installation_error, remove_tiled_samples  # type: ignore
1 +
from deeplake.enterprise.util import raise_indra_installation_error, remove_tiled_samples  # type: ignore
2 2
from deeplake.core.storage import S3Provider
3 3
4 4
from deeplake.util.dataset import try_flushing  # type: ignore
5 5
similarity index 74%
6 6
rename from deeplake/experimental/dataloader.py
7 7
rename to deeplake/enterprise/dataloader.py

@@ -65,6 +65,7 @@
Loading
65 65
        stream: bool = False,
66 66
        decompress: bool = True,
67 67
        is_tile: bool = False,
68 +
        to_pil: bool = False,
68 69
    ):
69 70
        if self.is_empty_tensor:
70 71
            raise EmptyTensorError(
@@ -129,7 +130,10 @@
Loading
129 130
            end_idx=stop,
130 131
            step=step,
131 132
            reverse=reverse,
133 +
            to_pil=to_pil,
132 134
        )
135 +
        if to_pil:
136 +
            return sample
133 137
134 138
        if squeeze:
135 139
            sample = sample.squeeze(0)

@@ -11,7 +11,7 @@
Loading
11 11
def raise_indra_installation_error(indra_import_error: Optional[Exception] = None):
12 12
    if not indra_import_error:
13 13
        raise ImportError(
14 -
            "This is an experimental feature that requires libdeeplake package. libdeeplake is available only on linux for python versions 3.6 through 3.10 and on macos for python versions 3.7 through 3.10"
14 +
            "This is an enterprise feature that requires libdeeplake package which can be installed using pip install deeplake[enterprise]. libdeeplake is available only on linux for python versions 3.6 through 3.10 and on macos for python versions 3.7 through 3.10"
15 15
        )
16 16
    raise ImportError(
17 17
        "Error while importing C++ backend. One of the dependencies might not be installed."

@@ -1,6 +1,6 @@
Loading
1 -
from typing import Optional, Union
2 -
from deeplake.experimental.convert_to_libdeeplake import dataset_to_libdeeplake
1 +
from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake
3 2
from deeplake.util.bugout_reporter import deeplake_reporter
3 +
from typing import Optional, Union
4 4
5 5
import numpy as np
6 6
@@ -26,7 +26,7 @@
Loading
26 26
        Query from dataset all the samples with labels other than ``5``
27 27
28 28
        >>> import deeplake
29 -
        >>> from deeplake.experimental import query
29 +
        >>> from deeplake.enterprise import query
30 30
        >>> ds = deeplake.load('hub://activeloop/fashion-mnist-train')
31 31
        >>> query_ds_train = query(ds, "select * where labels != 5")
32 32
33 33
similarity index 89%
34 34
rename from deeplake/experimental/test_pytorch.py
35 35
rename to deeplake/enterprise/test_pytorch.py

@@ -1,4 +1,3 @@
Loading
1 -
import io
2 1
from logging import warning
3 2
import deeplake
4 3
from deeplake.util.exceptions import (
@@ -18,7 +17,7 @@
Loading
18 17
from typing import Union, Tuple, Sequence, List, Optional, BinaryIO
19 18
import numpy as np
20 19
from pathlib import Path
21 -
from PIL import Image, UnidentifiedImageError  # type: ignore
20 +
from PIL import Image  # type: ignore
22 21
from io import BytesIO
23 22
24 23
import mmap
@@ -248,7 +247,8 @@
Loading
248 247
    end_idx: Optional[int] = None,
249 248
    step: Optional[int] = None,
250 249
    reverse: bool = False,
251 -
) -> np.ndarray:
250 +
    to_pil: bool = False,
251 +
) -> Union[np.ndarray, Image.Image]:
252 252
    """Decompress some buffer into a numpy array. It is expected that all meta information is
253 253
    stored inside `buffer`.
254 254
@@ -265,13 +265,14 @@
Loading
265 265
        end_idx: (int, Optional): Applicable only for video compressions. Index of last frame (exclusive).
266 266
        step: (int, Optional): Applicable only for video compressions. Step size for seeking.
267 267
        reverse (bool): Applicable only for video compressions. Reverses output numpy array if set to True.
268 +
        to_pil (bool): If True, will return a PIL image instead of a numpy array.
268 269
269 270
    Raises:
270 271
        SampleDecompressionError: If decompression fails.
271 272
        ValueError: If dtype and shape are not specified for byte compression.
272 273
273 274
    Returns:
274 -
        np.ndarray: Array from the decompressed buffer.
275 +
        Union[np.ndarray, Image.Image]: Decompressed array or PIL image.
275 276
    """
276 277
    compr_type = get_compression_type(compression)
277 278
    if compr_type == BYTE_COMPRESSION:
@@ -304,6 +305,8 @@
Loading
304 305
        if not isinstance(buffer, str):
305 306
            buffer = BytesIO(buffer)  # type: ignore
306 307
        img = Image.open(buffer)  # type: ignore
308 +
        if to_pil:
309 +
            return img
307 310
        arr = np.array(img)
308 311
        if shape is not None:
309 312
            arr = arr.reshape(shape)

@@ -81,6 +81,7 @@
Loading
81 81
        self._uncompressed_bytes = None
82 82
83 83
        self._array = None
84 +
        self._pil = None
84 85
        self._typestr = None
85 86
        self._shape = shape or None
86 87
        self._dtype = dtype or None
@@ -300,8 +301,8 @@
Loading
300 301
            self._compressed_bytes[compression] = compressed_bytes
301 302
        return compressed_bytes
302 303
303 -
    def _decompress(self):
304 -
        if self._array is not None:
304 +
    def _decompress(self, to_pil: bool = False):
305 +
        if not to_pil and self._array is not None:
305 306
            if self._uncompressed_bytes is None:
306 307
                self._uncompressed_bytes = self._array.tobytes()
307 308
            return
@@ -323,12 +324,24 @@
Loading
323 324
            else:
324 325
                compressed = self.buffer
325 326
326 -
            self._array = decompress_array(
327 -
                compressed, compression=compression, shape=self.shape, dtype=self.dtype
328 -
            )
329 -
            self._uncompressed_bytes = self._array.tobytes()
330 -
            self._typestr = self._array.__array_interface__["typestr"]
331 -
            self._dtype = np.dtype(self._typestr).name
327 +
            if to_pil:
328 +
                self._pil = decompress_array(
329 +
                    compressed,
330 +
                    compression=compression,
331 +
                    shape=self.shape,
332 +
                    dtype=self.dtype,
333 +
                    to_pil=True,
334 +
                )  # type: ignore
335 +
            else:
336 +
                self._array = decompress_array(
337 +
                    compressed,
338 +
                    compression=compression,
339 +
                    shape=self.shape,
340 +
                    dtype=self.dtype,
341 +
                )
342 +
                self._uncompressed_bytes = self._array.tobytes()
343 +
                self._typestr = self._array.__array_interface__["typestr"]
344 +
                self._dtype = np.dtype(self._typestr).name
332 345
333 346
    def uncompressed_bytes(self) -> Optional[bytes]:
334 347
        """Returns uncompressed bytes."""
@@ -352,6 +365,23 @@
Loading
352 365
        self._decompress()
353 366
        return self._array  # type: ignore
354 367
368 +
    @property
369 +
    def pil(self) -> Image.Image:  # type: ignore
370 +
        """Return PIL image corresponding to the sample. Decompresses the sample if necessary.
371 +
372 +
        Example:
373 +
374 +
            >>> sample = deeplake.read("./images/dog.jpg")
375 +
            >>> pil = sample.pil
376 +
            >>> pil.size
377 +
            (480, 323)
378 +
        """
379 +
        pil = self._pil
380 +
        if pil is not None:
381 +
            return pil
382 +
        self._decompress(to_pil=True)
383 +
        return self._pil
384 +
355 385
    def __str__(self):
356 386
        if self.is_lazy:
357 387
            return f"Sample(is_lazy=True, path={self.path})"

@@ -1,4 +1,5 @@
Loading
1 1
from typing import Callable, Dict, List, Optional
2 +
import warnings
2 3
from deeplake.util.exceptions import EmptyTensorError
3 4
from deeplake.util.iterable_ordered_dict import IterableOrderedDict
4 5
from deeplake.core.polygon import Polygons
@@ -82,9 +83,42 @@
Loading
82 83
    return compressed_tensors
83 84
84 85
85 -
def remove_intersections(compressed_tensors: List[str], raw_tensors: List[str]):
86 -
    compressed_tensors = [
87 -
        tensor for tensor in compressed_tensors if tensor not in raw_tensors
88 -
    ]
89 -
    raw_tensors.extend(compressed_tensors)
90 -
    return compressed_tensors, raw_tensors
86 +
def validate_decode_method(decode_method, all_tensor_keys, jpeg_png_compressed_tensors):
87 +
    raw_tensors = []
88 +
    compressed_tensors = []
89 +
    if decode_method is None:
90 +
        if len(jpeg_png_compressed_tensors) > 0:
91 +
            warnings.warn(
92 +
                f"Decode method for tensors {jpeg_png_compressed_tensors} is defaulting to numpy. Please consider specifying a decode_method in .pytorch() that maximizes the data preprocessing speed based on your transformation."
93 +
            )
94 +
        return raw_tensors, compressed_tensors
95 +
96 +
    jpeg_png_compressed_tensors_set = set(jpeg_png_compressed_tensors)
97 +
    generic_supported_decode_methods = {"numpy", "tobytes"}
98 +
    jpeg_png_supported_decode_methods = {"numpy", "tobytes", "pil"}
99 +
    for tensor_name, decode_method in decode_method.items():
100 +
        if tensor_name not in all_tensor_keys:
101 +
            raise ValueError(
102 +
                f"decode_method tensor {tensor_name} not found in tensors."
103 +
            )
104 +
        if tensor_name in jpeg_png_compressed_tensors_set:
105 +
            if decode_method not in jpeg_png_supported_decode_methods:
106 +
                raise ValueError(
107 +
                    f"decode_method {decode_method} not supported for tensor {tensor_name}. Supported methods for this tensor are {jpeg_png_supported_decode_methods}"
108 +
                )
109 +
        elif decode_method not in generic_supported_decode_methods:
110 +
            raise ValueError(
111 +
                f"decode_method {decode_method} not supported for tensor {tensor_name}. Supported methods for this tensor are {generic_supported_decode_methods}"
112 +
            )
113 +
        if decode_method == "tobytes":
114 +
            raw_tensors.append(tensor_name)
115 +
        elif decode_method == "pil":
116 +
            compressed_tensors.append(tensor_name)
117 +
118 +
    return raw_tensors, compressed_tensors
119 +
120 +
121 +
def get_collate_fn(collate, mode):
122 +
    if collate is None and mode == "pytorch":
123 +
        return collate_fn
124 +
    return collate

@@ -0,0 +1,3 @@
Loading
1 +
from deeplake.enterprise.dataloader import dataloader, DeepLakeDataLoader
2 +
from deeplake.enterprise.libdeeplake_query import query, sample_by
3 +
from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake
0 4
similarity index 96%
1 5
rename from deeplake/experimental/convert_to_libdeeplake.py
2 6
rename to deeplake/enterprise/convert_to_libdeeplake.py

@@ -64,7 +64,7 @@
Loading
64 64
            ds_out (Dataset, optional): The dataset object to which the transform will get written. If this is not provided, data_in will be overwritten if it is a Deep Lake dataset, otherwise error will be raised.
65 65
                It should have all keys being generated in output already present as tensors. Its initial state should be either:-
66 66
                - Empty i.e. all tensors have no samples. In this case all samples are added to the dataset.
67 -
                - All tensors are populated and have sampe length. In this case new samples are appended to the dataset.
67 +
                - All tensors are populated and have same length. In this case new samples are appended to the dataset.
68 68
            num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
69 69
            scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
70 70
                Defaults to 'threaded'.
@@ -120,31 +120,51 @@
Loading
120 120
        pad_data_in: bool = False,
121 121
        **kwargs,
122 122
    ):
123 -
        """Evaluates the pipeline on data_in to produce an output dataset ds_out.
123 +
        """Evaluates the pipeline on ``data_in`` to produce an output dataset ``ds_out``.
124 124
125 125
        Args:
126 126
            data_in: Input passed to the transform to generate output dataset. Should support \__getitem__ and \__len__. Can be a Deep Lake dataset.
127 -
            ds_out (Dataset, optional): The dataset object to which the transform will get written. If this is not provided, data_in will be overwritten if it is a Deep Lake dataset, otherwise error will be raised.
128 -
                It should have all keys being generated in output already present as tensors. It's initial state should be either:-
129 -
                - Empty i.e. all tensors have no samples. In this case all samples are added to the dataset.
130 -
                - All tensors are populated and have sampe length. In this case new samples are appended to the dataset.
127 +
            ds_out (Dataset, optional): - The dataset object to which the transform will get written. If this is not provided, ``data_in`` will be overwritten if it is a Deep Lake dataset, otherwise error will be raised.
128 +
                - It should have all keys being generated in output already present as tensors. Its initial state should be either:
129 +
                - **Empty**, i.e., all tensors have no samples. In this case all samples are added to the dataset.
130 +
                - **All tensors are populated and have same length.** In this case new samples are appended to the dataset.
131 131
            num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
132 132
            scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
133 133
                Defaults to 'threaded'.
134 -
            progressbar (bool): Displays a progress bar if True (default).
135 -
            skip_ok (bool): If True, skips the check for output tensors generated. This allows the user to skip certain tensors in the function definition.
136 -
                This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to False.
137 -
            check_lengths (bool): If True, checks whether ds_out has tensors of same lengths initially.
138 -
            pad_data_in (bool): NOTE: This is only applicable if data_in is a Deep Lake dataset. If True, pads tensors of data_in to match the length of the largest tensor in data_in.
139 -
                Defaults to False.
134 +
            progressbar (bool): Displays a progress bar if ``True`` (default).
135 +
            skip_ok (bool): If ``True``, skips the check for output tensors generated. This allows the user to skip certain tensors in the function definition.
136 +
                This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to ``False``.
137 +
            check_lengths (bool): If ``True``, checks whether ``ds_out`` has tensors of same lengths initially.
138 +
            pad_data_in (bool): If ``True``, pads tensors of ``data_in`` to match the length of the largest tensor in ``data_in``.
139 +
                Defaults to ``False``.
140 140
            **kwargs: Additional arguments.
141 141
142 142
        Raises:
143 -
            InvalidInputDataError: If data_in passed to transform is invalid. It should support \__getitem__ and \__len__ operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as data_in will also raise this.
144 -
            InvalidOutputDatasetError: If all the tensors of ds_out passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ds_out will also raise this.
143 +
            InvalidInputDataError: If ``data_in`` passed to transform is invalid. It should support \__getitem__ and \__len__ operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``data_in`` will also raise this.
144 +
            InvalidOutputDatasetError: If all the tensors of ``ds_out`` passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``ds_out`` will also raise this.
145 145
            TensorMismatchError: If one or more of the outputs generated during transform contain different tensors than the ones present in 'ds_out' provided to transform.
146 146
            UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
147 147
            TransformError: All other exceptions raised if there are problems while running the pipeline.
148 +
149 +
        Example::
150 +
151 +
            @deeplake.compute
152 +
            def my_fn(sample_in: Any, samples_out, my_arg0, my_arg1=0):
153 +
                samples_out.my_tensor.append(my_arg0 * my_arg1)
154 +
155 +
            # This transform can be used using the eval method in one of these 2 ways:-
156 +
157 +
            # Directly evaluating the method
158 +
            # here arg0 and arg1 correspond to the 3rd and 4th argument in my_fn
159 +
            my_fn(arg0, arg1).eval(data_in, ds_out, scheduler="threaded", num_workers=5)
160 +
161 +
            # As a part of a Transform pipeline containing other functions
162 +
            pipeline = deeplake.compose([my_fn(a, b), another_function(x=2)])
163 +
            pipeline.eval(data_in, ds_out, scheduler="processed", num_workers=2)
164 +
165 +
        Note:
166 +
            ``pad_data_in`` is only applicable if ``data_in`` is a Deep Lake dataset.
167 +
148 168
        """
149 169
        num_workers, scheduler = sanitize_workers_scheduler(num_workers, scheduler)
150 170
        overwrite = ds_out is None
@@ -449,20 +469,27 @@
Loading
449 469
450 470
        - Supported values include: 'serial', 'threaded', 'processed' and 'ray'. Defaults to 'threaded'.
451 471
452 -
    - ``progressbar (bool)``: Displays a progress bar if True (default).
472 +
    - ``progressbar (bool)``: Displays a progress bar if ``True`` (default).
453 473
454 -
    - ``skip_ok (bool)``: If True, skips the check for output tensors generated.
474 +
    - ``skip_ok (bool)``: If ``True``, skips the check for output tensors generated.
455 475
456 476
        - This allows the user to skip certain tensors in the function definition.
457 477
        - This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to ``False``.
458 478
479 +
    - ``check_lengths (bool)``: If ``True``, checks whether ``ds_out`` has tensors of same lengths initially.
480 +
481 +
    - ``pad_data_in (bool)``: If ``True``, pads tensors of ``data_in`` to match the length of the largest tensor in ``data_in``. Defaults to ``False``.
482 +
483 +
    Note:
484 +
        ``pad_data_in`` is only applicable if ``data_in`` is a Deep Lake dataset.
485 +
459 486
    It raises the following errors:
460 487
461 -
    - ``InvalidInputDataError``: If data_in passed to transform is invalid. It should support ``__getitem__`` and ``__len__`` operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as data_in will also raise this.
488 +
    - ``InvalidInputDataError``: If ``data_in`` passed to transform is invalid. It should support ``__getitem__`` and ``__len__`` operations. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``data_in`` will also raise this.
462 489
463 -
    - ``InvalidOutputDatasetError``: If all the tensors of ds_out passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ds_out will also raise this.
490 +
    - ``InvalidOutputDatasetError``: If all the tensors of ``ds_out`` passed to transform don't have the same length. Using scheduler other than "threaded" with deeplake dataset having base storage as memory as ``ds_out`` will also raise this.
464 491
465 -
    - ``TensorMismatchError``: If one or more of the outputs generated during transform contain different tensors than the ones present in 'ds_out' provided to transform.
492 +
    - ``TensorMismatchError``: If one or more of the outputs generated during transform contain different tensors than the ones present in ``ds_out`` provided to transform.
466 493
467 494
    - ``UnsupportedSchedulerError``: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
468 495
Files Coverage
deeplake 89.15%
conftest.py 100.00%
setup.py 0.00%
Project Totals (253 files) 89.04%
3471493864
PYTHON=undefined
OS=undefined
unittests

No yaml found.

Create your codecov.yml to customize your Codecov experience

Sunburst
The innermost circle is the entire project; moving away from the center are folders and then, finally, individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project, followed by folders and finally individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading