# Preprocessing of the SQuAD v1.1 dev set into BERT-style question answering features.
import json
import os
from pathlib import Path
from transformers import BertTokenizer
import numpy as np
from extra.utils import download_file

BASEDIR = Path(__file__).parent.parent / "extra/datasets/squad"

def init_dataset():
  os.makedirs(BASEDIR, exist_ok=True)
  download_file("https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json", BASEDIR / "dev-v1.1.json")
  with open(BASEDIR / "dev-v1.1.json") as f:
    data = json.load(f)["data"]

  examples = []
  for article in data:
    for paragraph in article["paragraphs"]:
      # split the context into whitespace-delimited word tokens (0x202F is a narrow no-break space)
      text = paragraph["context"]
      doc_tokens = []
      prev_is_whitespace = True
      for c in text:
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
          prev_is_whitespace = True
        else:
          if prev_is_whitespace:
            doc_tokens.append(c)
          else:
            doc_tokens[-1] += c
          prev_is_whitespace = False

      # one example per question; all questions in a paragraph share its word tokens
      for qa in paragraph["qas"]:
        qa_id = qa["id"]
        q_text = qa["question"]

        examples.append({
          "id": qa_id,
          "question": q_text,
          "context": doc_tokens,
          "answers": list(map(lambda x: x["text"], qa["answers"]))
        })
  return examples

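# Sketch of the dict returned per question (values are illustrative, not real dataset entries):
#   {"id": "<qa id>", "question": "Which team won ...?",
#    "context": ["The", "game", "was", ...], "answers": ["the Broncos", ...]}
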
def _check_is_max_context(doc_spans, cur_span_index, position):
  # A token can appear in several overlapping doc spans; treat the span where it has the most
  # surrounding context as its "max context" span (longer spans get a small tie-break bonus).
  best_score, best_span_index = None, None
  for di, (doc_start, doc_length) in enumerate(doc_spans):
    end = doc_start + doc_length - 1
    if position < doc_start:
      continue
    if position > end:
      continue
    num_left_context = position - doc_start
    num_right_context = end - position
    score = min(num_left_context, num_right_context) + 0.01 * doc_length
    if best_score is None or score > best_score:
      best_score = score
      best_span_index = di
  return cur_span_index == best_span_index

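# Worked example with illustrative numbers: for doc_spans [(0, 361), (128, 361)], position 200
# scores min(200, 160) + 0.01*361 = 163.61 in span 0 but only min(72, 288) + 0.01*361 = 75.61
# in span 1, so span 0 is its max-context span.
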
def convert_example_to_features(example, tokenizer):
  query_tokens = tokenizer.tokenize(example["question"])

  # cap the question at 64 wordpieces
  if len(query_tokens) > 64:
    query_tokens = query_tokens[:64]

  # wordpiece-tokenize the context, keeping a mapping back to the original word index
  tok_to_orig_index = []
  orig_to_tok_index = []
  all_doc_tokens = []
  for i, token in enumerate(example["context"]):
    orig_to_tok_index.append(len(all_doc_tokens))
    sub_tokens = tokenizer.tokenize(token)
    for sub_token in sub_tokens:
      tok_to_orig_index.append(i)
      all_doc_tokens.append(sub_token)

  # 384 total positions minus the question minus [CLS] and two [SEP] tokens
  max_tokens_for_doc = 384 - len(query_tokens) - 3

  # slide a window over the document with a stride of 128 tokens until it is fully covered
  doc_spans = []
  start_offset = 0
  while start_offset < len(all_doc_tokens):
    length = len(all_doc_tokens) - start_offset
    length = min(length, max_tokens_for_doc)
    doc_spans.append((start_offset, length))
    if start_offset + length == len(all_doc_tokens):
      break
    start_offset += min(length, 128)

  outputs = []
  for di, (doc_start, doc_length) in enumerate(doc_spans):
    # build [CLS] question [SEP] doc_span [SEP], segment 0 for the question, 1 for the document
    tokens = []
    token_to_orig_map = {}
    token_is_max_context = {}
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in query_tokens:
      tokens.append(token)
      segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    for i in range(doc_length):
      split_token_index = doc_start + i
      token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
      token_is_max_context[len(tokens)] = _check_is_max_context(doc_spans, di, split_token_index)
      tokens.append(all_doc_tokens[split_token_index])
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # zero-pad everything to the fixed sequence length of 384
    while len(input_ids) < 384:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

    assert len(input_ids) == 384
    assert len(input_mask) == 384
    assert len(segment_ids) == 384

    outputs.append({
      "input_ids": np.expand_dims(np.array(input_ids), 0).astype(np.float32),
      "input_mask": np.expand_dims(np.array(input_mask), 0).astype(np.float32),
      "segment_ids": np.expand_dims(np.array(segment_ids), 0).astype(np.float32),
      "token_to_orig_map": token_to_orig_map,
      "token_is_max_context": token_is_max_context,
      "tokens": tokens,
    })

  return outputs

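# Minimal usage sketch (qa_model is hypothetical; any BERT QA model consuming
# input_ids/input_mask/segment_ids of shape (1, 384) would fit):
#   feats = convert_example_to_features(example, tokenizer)
#   start_logits, end_logits = qa_model(feats[0]["input_ids"], feats[0]["input_mask"], feats[0]["segment_ids"])
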
def iterate(tokenizer, start=0):
  examples = init_dataset()
  print(f"there are {len(examples)} pairs in the dataset")

  for i in range(start, len(examples)):
    example = examples[i]
    features = convert_example_to_features(example, tokenizer)
    # yield all features of an example: its F1 score is the maximum over all of its features
    yield features, example

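# Evaluation sketch (predict and compute_f1 are hypothetical helpers, not defined in this file):
#   for features, example in iterate(tokenizer):
#     score = max(compute_f1(predict(f), answer) for f in features for answer in example["answers"])
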
if __name__ == "__main__":
  tokenizer = BertTokenizer(str(Path(__file__).parent.parent / "weights/bert_vocab.txt"))

  X, Y = next(iterate(tokenizer))
  print(" ".join(X[0]["tokens"]))
  print(X[0]["input_ids"].shape, Y)