diff --git a/extra/datasets/coco.py b/extra/datasets/coco.py
index b5d18886..008efae4 100644
--- a/extra/datasets/coco.py
+++ b/extra/datasets/coco.py
@@ -12,7 +12,7 @@
 iou = _mask.iou
 merge = _mask.merge
 frPyObjects = _mask.frPyObjects
-BASEDIR = pathlib.Path(__file__).parent.parent / "extra" / "datasets" / "COCO"
+BASEDIR = pathlib.Path(__file__).parent / "COCO"
 BASEDIR.mkdir(exist_ok=True)
 
 def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for row in rows}
diff --git a/extra/datasets/imagenet.py b/extra/datasets/imagenet.py
index 5048314b..dde32a5e 100644
--- a/extra/datasets/imagenet.py
+++ b/extra/datasets/imagenet.py
@@ -5,7 +5,7 @@
 import numpy as np
 from PIL import Image
 import functools, pathlib
-BASEDIR = pathlib.Path(__file__).parent.parent / "extra/datasets/imagenet"
+BASEDIR = pathlib.Path(__file__).parent / "imagenet"
 
 ci = json.load(open(BASEDIR / "imagenet_class_index.json"))
 cir = {v[0]: int(k) for k,v in ci.items()}
diff --git a/extra/datasets/imagenet_download.py b/extra/datasets/imagenet_download.py
index ebb017e4..7eca9dd2 100644
--- a/extra/datasets/imagenet_download.py
+++ b/extra/datasets/imagenet_download.py
@@ -14,38 +14,38 @@ def imagenet_extract(file, path, small=False):
 
 def imagenet_prepare_val():
   # Read in the labels file
-  with open(Path(__file__).parent.parent / "extra/datasets/imagenet/imagenet_2012_validation_synset_labels.txt", 'r') as f:
+  with open(Path(__file__).parent / "imagenet" / "imagenet_2012_validation_synset_labels.txt", 'r') as f:
     labels = f.read().splitlines()
   f.close()
   # Get a list of images
-  images = os.listdir(Path(__file__).parent.parent / "extra/datasets/imagenet/val")
+  images = os.listdir(Path(__file__).parent / "imagenet" / "val")
   images.sort()
   # Create folders and move files into those
   for co,dir in enumerate(labels):
-    os.makedirs(Path(__file__).parent.parent / "extra/datasets/imagenet/val" / dir, exist_ok=True)
-    os.replace(Path(__file__).parent.parent / "extra/datasets/imagenet/val" / images[co], Path(__file__).parent.parent / "extra/datasets/imagenet/val" / dir / images[co])
-  os.remove(Path(__file__).parent.parent / "extra/datasets/imagenet/imagenet_2012_validation_synset_labels.txt")
+    os.makedirs(Path(__file__).parent / "imagenet" / "val" / dir, exist_ok=True)
+    os.replace(Path(__file__).parent / "imagenet" / "val" / images[co], Path(__file__).parent / "imagenet" / "val" / dir / images[co])
+  os.remove(Path(__file__).parent / "imagenet" / "imagenet_2012_validation_synset_labels.txt")
 
 def imagenet_prepare_train():
-  images = os.listdir(Path(__file__).parent.parent / "extra/datasets/imagenet/train")
+  images = os.listdir(Path(__file__).parent / "imagenet" / "train")
   for co,tarf in enumerate(images): # for each tar file found. Create a folder with its name. Extract into that folder. Remove tar file
-    if Path(Path(__file__).parent.parent / "extra/datasets/imagenet/train" / images[co]).is_file():
+    if Path(Path(__file__).parent / "imagenet" / "train" / images[co]).is_file():
       images[co] = tarf[:-4] # remove .tar from extracted tar files
-      os.makedirs(Path(__file__).parent.parent / "extra/datasets/imagenet/train" / images[co], exist_ok=True)
-      imagenet_extract(Path(__file__).parent.parent / "extra/datasets/imagenet/train" / tarf, Path(__file__).parent.parent / "extra/datasets/imagenet/train" / images[co], small=True)
-      os.remove(Path(__file__).parent.parent / "extra/datasets/imagenet/train" / tarf)
+      os.makedirs(Path(__file__).parent / "imagenet" / "train" / images[co], exist_ok=True)
+      imagenet_extract(Path(__file__).parent / "imagenet" / "train" / tarf, Path(__file__).parent / "imagenet" / "train" / images[co], small=True)
+      os.remove(Path(__file__).parent / "imagenet" / "train" / tarf)
 
 if __name__ == "__main__":
-  os.makedirs(Path(__file__).parent.parent / "extra/datasets/imagenet", exist_ok=True)
-  os.makedirs(Path(__file__).parent.parent / "extra/datasets/imagenet/val", exist_ok=True)
-  os.makedirs(Path(__file__).parent.parent / "extra/datasets/imagenet/train", exist_ok=True)
-  download_file("https://raw.githubusercontent.com/raghakot/keras-vis/master/resources/imagenet_class_index.json", Path(__file__).parent.parent / "extra/datasets/imagenet/imagenet_class_index.json")
-  download_file("https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_2012_validation_synset_labels.txt", Path(__file__).parent.parent / "extra/datasets/imagenet/imagenet_2012_validation_synset_labels.txt")
-  download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar", Path(__file__).parent.parent / "extra/datasets/imagenet/ILSVRC2012_img_val.tar") # 7GB
-  imagenet_extract(Path(__file__).parent.parent / "extra/datasets/imagenet/ILSVRC2012_img_val.tar", Path(__file__).parent.parent / "extra/datasets/imagenet/val")
+  os.makedirs(Path(__file__).parent / "imagenet", exist_ok=True)
+  os.makedirs(Path(__file__).parent / "imagenet" / "val", exist_ok=True)
+  os.makedirs(Path(__file__).parent / "imagenet" / "train", exist_ok=True)
+  download_file("https://raw.githubusercontent.com/raghakot/keras-vis/master/resources/imagenet_class_index.json", Path(__file__).parent / "imagenet" / "imagenet_class_index.json")
+  download_file("https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_2012_validation_synset_labels.txt", Path(__file__).parent / "imagenet" / "imagenet_2012_validation_synset_labels.txt")
+  download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar", Path(__file__).parent / "imagenet" / "ILSVRC2012_img_val.tar") # 7GB
+  imagenet_extract(Path(__file__).parent / "imagenet" / "ILSVRC2012_img_val.tar", Path(__file__).parent / "imagenet" / "val")
   imagenet_prepare_val()
   if os.getenv('IMGNET_TRAIN', None) is not None:
-    download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar", Path(__file__).parent.parent / "extra/datasets/imagenet/ILSVRC2012_img_train.tar") #138GB!
-    imagenet_extract(Path(__file__).parent.parent / "extra/datasets/imagenet/ILSVRC2012_img_train.tar", Path(__file__).parent.parent / "extra/datasets/imagenet/train")
+    download_file("https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar", Path(__file__).parent / "imagenet" / "ILSVRC2012_img_train.tar") #138GB!
+    imagenet_extract(Path(__file__).parent / "imagenet" / "ILSVRC2012_img_train.tar", Path(__file__).parent / "imagenet" / "train")
   imagenet_prepare_train()
diff --git a/extra/datasets/kits19.py b/extra/datasets/kits19.py
index b49d3cc0..da51576f 100644
--- a/extra/datasets/kits19.py
+++ b/extra/datasets/kits19.py
@@ -9,7 +9,7 @@
 import torch
 import torch.nn.functional as F
 from tinygrad.tensor import Tensor
-BASEDIR = Path(__file__).parent.parent.resolve() / "extra" / "datasets" / "kits19" / "data"
+BASEDIR = Path(__file__).parent / "kits19" / "data"
 
 """
 To download the dataset:
diff --git a/extra/datasets/librispeech.py b/extra/datasets/librispeech.py
index 2e85a335..434d7a05 100644
--- a/extra/datasets/librispeech.py
+++ b/extra/datasets/librispeech.py
@@ -15,7 +15,7 @@ for file in $(find * | grep flac); do ffmpeg -i $file -ar 16k "$(dirname $file)/$(basename $file .flac).wav"; done
 ```
 
 Then this [file](https://github.com/mlcommons/inference/blob/master/speech_recognition/rnnt/dev-clean-wav.json) has to also be put in `extra/datasets/librispeech`.
 """
-BASEDIR = pathlib.Path(__file__).parent.parent / "extra/datasets/librispeech"
+BASEDIR = pathlib.Path(__file__).parent / "librispeech"
 with open(BASEDIR / "dev-clean-wav.json") as f:
   ci = json.load(f)
diff --git a/extra/datasets/openimages.py b/extra/datasets/openimages.py
index 641b6bd6..8e411e06 100644
--- a/extra/datasets/openimages.py
+++ b/extra/datasets/openimages.py
@@ -11,7 +11,7 @@
 from tqdm import tqdm
 import pandas as pd
 import concurrent.futures
-BASEDIR = pathlib.Path(__file__).parent.parent / "extra/datasets/open-images-v6-mlperf"
+BASEDIR = pathlib.Path(__file__).parent / "open-images-v6-mlperf"
 BUCKET_NAME = "open-images-dataset"
 BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv"
 MAP_CLASSES_URL = "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv"
diff --git a/extra/datasets/squad.py b/extra/datasets/squad.py
index a69ee986..f9c1c7c9 100644
--- a/extra/datasets/squad.py
+++ b/extra/datasets/squad.py
@@ -5,7 +5,7 @@
 from transformers import BertTokenizer
 import numpy as np
 from extra.utils import download_file
-BASEDIR = Path(__file__).parent.parent / "extra/datasets/squad"
+BASEDIR = Path(__file__).parent / "squad"
 def init_dataset():
   os.makedirs(BASEDIR, exist_ok=True)
   download_file("https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json", BASEDIR / "dev-v1.1.json")
@@ -141,7 +141,7 @@ def iterate(tokenizer, start=0):
     yield features, example
 
 if __name__ == "__main__":
-  tokenizer = BertTokenizer(str(Path(__file__).parent.parent / "weights/bert_vocab.txt"))
+  tokenizer = BertTokenizer(str(Path(__file__).parent.parent.parent / "weights" / "bert_vocab.txt"))
   X, Y = next(iterate(tokenizer))
   print(" ".join(X[0]["tokens"]))