import logging
from pathlib import Path
from typing import Iterator, Tuple, Union
import numpy as np
import torch
from torch.utils.data.dataset import Dataset
# A single dataset entry: (input tensor, output/label tensor).
RetT = Tuple[torch.Tensor, torch.Tensor]
# NOTE(review): this binds the *root* logger; the usual convention is
# logging.getLogger(__name__). Left as-is — other code may configure the
# root logger and depend on messages landing there. Confirm before changing.
msg_logger = logging.getLogger()
# Anything accepted where a filesystem path is expected.
PathLike = Union[Path, str]
class SingleFileDataset(Dataset):
    """A dataset held entirely in memory as one inputs tensor and one
    outputs tensor, indexed in lockstep along their first dimension."""

    def __init__(self, inputs: torch.Tensor, outputs: torch.Tensor):
        """Store the paired input/output tensors.

        Args:
            inputs: tensor of inputs, first dim is the entry index.
            outputs: tensor of outputs, same first-dim length as `inputs`.
        """
        self.inputs = inputs
        self.outputs = outputs

    @classmethod
    def from_file(cls, *args, **kwargs):
        # Alternate constructor; concrete subclasses implement the actual
        # file-loading logic (returns None here).
        pass

    @property
    def sample_input(self):
        """The input tensor of the first entry (label discarded)."""
        first_input, _ = next(iter(self))
        return first_input

    def __len__(self) -> int:
        # Number of entries == length of the first dimension of `inputs`.
        return len(self.inputs)

    def __getitem__(self, idx) -> RetT:
        # Paired indexing: the idx-th input with the idx-th output.
        return self.inputs[idx], self.outputs[idx]

    def __iter__(self) -> Iterator[RetT]:
        # Yield (input, output) pairs in index order.
        for position in range(len(self)):
            yield self[position]
class DNNDataset(SingleFileDataset):
    """Image-classification dataset loaded from two raw binary files:
    one holding the images, one holding the integer labels."""

    # Per-image shape, e.g. (C, H, W); concrete subclasses define this.
    image_shape = None
    # On-disk dtype of the labels file.
    label_ty = np.int32

    @classmethod
    def from_file(
        cls,
        input_file: PathLike,
        labels_file: PathLike,
        count: int = -1,
        offset: int = 0,
    ):
        """Load up to `count` images and labels, skipping the first
        `offset` entries of each file.

        Args:
            input_file: raw binary file of images (`cls.image_shape` each).
            labels_file: raw binary file of labels (`cls.label_ty` each).
            count: number of entries to load; -1 means all remaining.
            offset: number of leading entries to skip.

        Raises:
            ValueError: if the two files yield different entry counts.
        """
        # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC)
        image_size = np.prod(np.array(cls.image_shape))
        # Counts/offsets passed below are in scalar *elements*, not bytes;
        # one image spans `image_size` elements.
        inputs_count = -1 if count == -1 else count * image_size
        inputs = read_tensor_from_file(
            input_file,
            -1,
            *cls.image_shape,
            count=inputs_count,
            offset=offset * image_size,
        )
        # np.int64 (not the removed-in-NumPy-1.24 `np.long` alias) — the
        # integer dtype torch expects for class labels.
        labels = read_tensor_from_file(
            labels_file,
            -1,
            read_ty=cls.label_ty,
            cast_ty=np.int64,
            count=count,
            offset=offset,
        )
        if inputs.shape[0] != labels.shape[0]:
            raise ValueError("Input and output have different number of data points")
        # Lazy %-formatting: no f-prefix, the logger substitutes the arg.
        msg_logger.info("%d entries loaded from dataset.", inputs.shape[0])
        return cls(inputs, labels)
class MNIST(DNNDataset):
    """MNIST digits: single-channel 28x28 images."""

    image_shape = (1, 28, 28)
class CIFAR(DNNDataset):
    """CIFAR: three-channel 32x32 images."""

    image_shape = (3, 32, 32)
class ImageNet(DNNDataset):
    """ImageNet: three-channel 224x224 images."""

    image_shape = (3, 224, 224)
def read_tensor_from_file(
    filename: Union[str, Path],
    *shape: int,
    read_ty=np.float32,
    cast_ty=np.float32,
    count: int = -1,
    offset: int = 0,
) -> torch.Tensor:
    """Read a raw binary file into a torch.Tensor of the given shape.

    Args:
        filename: path to the raw binary file.
        *shape: target tensor shape; may contain -1 for an inferred dim.
        read_ty: numpy dtype stored on disk.
        cast_ty: numpy dtype of the returned tensor's data.
        count: max number of scalar entries to read; -1 means all.
        offset: number of leading scalar entries (of `read_ty`) to skip.
    """
    # The memmap offset is in bytes; convert from entry count.
    byte_offset = offset * read_ty().itemsize
    raw = np.memmap(filename, dtype=read_ty, mode="r", offset=byte_offset)
    if count == -1:
        n_entries = raw.shape[0]
    else:
        # Never read past the end of the file.
        n_entries = min(raw.shape[0], count)
    # astype() copies out of the memmap; clone() ensures the returned
    # tensor owns its own memory, independent of the numpy buffer.
    typed = raw[:n_entries].reshape(shape).astype(cast_ty)
    return torch.from_numpy(typed).clone()