# datasets/imagenet_download.py
# Python version of https://gist.github.com/antoinebrl/7d00d5cb6c95ef194c737392ef7e476a
#
# Environment variables (also listed in docs/env_vars.md under
# "datasets/imagenet_download.py"):
#
#   Variable     | Possible Value(s) | Description
#   ---          | ---               | ---
#   IMGNET_TRAIN | [1]               | also download the ImageNet training data
from pathlib import Path
import tarfile, os

# Root of the dataset tree. This script lives in <repo>/datasets/, so two
# parents up is the repository root.
BASEDIR = Path(__file__).parent.parent / "datasets/imagenet"


def imagenet_extract(file, path, small=False):
    """Extract every member of the tar archive `file` into directory `path`.

    Args:
        file: path of the tar archive to read.
        path: destination directory for the extracted members.
        small: when True, extract silently; otherwise wrap the member list in
            a tqdm progress bar (meant for the big archives).
    """
    with tarfile.open(name=file) as tar:
        members = tar.getmembers()
        if not small:
            # Imported lazily: tqdm is only needed when a progress bar is shown.
            from tqdm import tqdm
            members = tqdm(iterable=members, total=len(members))
        # NOTE(review): members are extracted without a tarfile extraction
        # filter. The archives come from the fixed URLs in this script, but
        # consider passing filter="data" on Python versions that support it.
        for member in members:
            tar.extract(path=path, member=member)


def imagenet_prepare_val(base=None):
    """Sort the flat validation images into one sub-folder per synset label.

    Line i of the labels file names the synset of the i-th image (images taken
    in sorted filename order). Each image is moved to `val/<synset>/<image>`
    and the labels file is deleted afterwards.

    Args:
        base: dataset root containing `val/` and the labels file. Defaults to
            `BASEDIR` (`<repo>/datasets/imagenet`).

    Raises:
        ValueError: if the number of images differs from the number of labels,
            since pairing them anyway would mislabel the validation set.
    """
    base = BASEDIR if base is None else Path(base)
    labels_file = base / "imagenet_2012_validation_synset_labels.txt"
    val_dir = base / "val"
    # Read in the labels file
    with open(labels_file, 'r') as f:
        labels = f.read().splitlines()
    # Get a sorted list of images. Only regular files count, so class folders
    # left over from an earlier run are not mistaken for images.
    images = sorted(p.name for p in val_dir.iterdir() if p.is_file())
    if len(images) != len(labels):
        raise ValueError(f"found {len(images)} validation images but {len(labels)} labels")
    # Create folders and move files into those
    for synset, image in zip(labels, images):
        os.makedirs(val_dir / synset, exist_ok=True)
        # os.replace takes no exist_ok argument; it already overwrites the target.
        os.replace(val_dir / image, val_dir / synset / image)
    os.remove(labels_file)


def imagenet_prepare_train(base=None):
    """Unpack each per-class tar in `train/` into a folder named after it.

    For every `<name>.tar` file in the train directory: create `train/<name>/`,
    extract the archive into it, then delete the archive. Entries that are not
    `.tar` files (for example folders from an earlier run) are left untouched.

    Args:
        base: dataset root containing `train/`. Defaults to `BASEDIR`
            (`<repo>/datasets/imagenet`).
    """
    base = BASEDIR if base is None else Path(base)
    train_dir = base / "train"
    for name in sorted(os.listdir(train_dir)):
        archive = train_dir / name
        if archive.suffix != ".tar" or not archive.is_file():
            continue
        class_dir = train_dir / archive.stem  # file name without the .tar suffix
        os.makedirs(class_dir, exist_ok=True)
        imagenet_extract(archive, class_dir, small=True)
        os.remove(archive)


if __name__ == "__main__":
    # Only the script entry point needs the repo's download helper.
    from extra.utils import download_file

    for folder in (BASEDIR, BASEDIR / "val", BASEDIR / "train"):
        os.makedirs(folder, exist_ok=True)
    download_file("https://raw.githubusercontent.com/raghakot/keras-vis/master/resources/imagenet_class_index.json", BASEDIR / "imagenet_class_index.json")
    download_file("https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_2012_validation_synset_labels.txt", BASEDIR / "imagenet_2012_validation_synset_labels.txt")
    download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar", BASEDIR / "ILSVRC2012_img_val.tar")  # 7GB
    imagenet_extract(BASEDIR / "ILSVRC2012_img_val.tar", BASEDIR / "val")
    imagenet_prepare_val()
    # os.getenv is a function: call it rather than subscripting it.
    if os.getenv('IMGNET_TRAIN') is not None:
        download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar", BASEDIR / "ILSVRC2012_img_train.tar")  # 138GB!
        imagenet_extract(BASEDIR / "ILSVRC2012_img_train.tar", BASEDIR / "train")
        imagenet_prepare_train()