# openpilot1/tools/lib/url_file.py
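"""File-like access to files served over HTTP(S).

URLFile exposes read()/seek() on a remote file, fetching byte ranges on demand.
When caching is enabled (FILEREADER_CACHE=1 or cache=True), CHUNK_SIZE-aligned
chunks and the file length are stored under Paths.download_cache_root().
"""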


import logging
import os
import socket
import time
from hashlib import sha256
from urllib3 import PoolManager, Retry
from urllib3.response import BaseHTTPResponse
from urllib3.util import Timeout

from openpilot.common.file_helpers import atomic_write_in_dir
from openpilot.system.hardware.hw import Paths

# Cache chunk size
K = 1000
CHUNK_SIZE = 1000 * K

logging.getLogger("urllib3").setLevel(logging.WARNING)


def hash_256(link: str) -> str:
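  """Return the cache key for a URL: the SHA-256 hex digest of the URL with any query string stripped."""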
  hsh = str(sha256((link.split("?")[0]).encode('utf-8')).hexdigest())
  return hsh


class URLFileException(Exception):
  pass


class URLFile:
  _pool_manager: PoolManager|None = None

  @staticmethod
  def reset() -> None:
    URLFile._pool_manager = None

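  # A single urllib3 PoolManager is created lazily and shared by all URLFile instances,
  # with keep-alive sockets and retries on transient HTTP errors (409/429/503/504).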
  @staticmethod
  def pool_manager() -> PoolManager:
    if URLFile._pool_manager is None:
      socket_options = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),]
      retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[409, 429, 503, 504])
      URLFile._pool_manager = PoolManager(num_pools=10, maxsize=100, socket_options=socket_options, retries=retries)
    return URLFile._pool_manager

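  # `timeout` applies to both the connect and read phases; `cache=None` defers to the
  # FILEREADER_CACHE environment variable handled below.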
  def __init__(self, url: str, timeout: int=10, debug: bool=False, cache: bool|None=None):
    self._url = url
    self._timeout = Timeout(connect=timeout, read=timeout)
    self._pos = 0
    self._length: int|None = None
    self._debug = debug
    # Force a fresh download (no caching) by default; set FILEREADER_CACHE=1 to enable
    # the on-disk cache. An explicit `cache` argument overrides the environment variable.
    self._force_download = not int(os.environ.get("FILEREADER_CACHE", "0"))
    if cache is not None:
      self._force_download = not cache

    if not self._force_download:
      os.makedirs(Paths.download_cache_root(), exist_ok=True)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback) -> None:
    pass

  def _request(self, method: str, url: str, headers: dict[str, str]|None=None) -> BaseHTTPResponse:
    return URLFile.pool_manager().request(method, url, timeout=self._timeout, headers=headers)

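  # Ask the server for the file size with a HEAD request; -1 signals a missing or inaccessible file.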
  def get_length_online(self) -> int:
    response = self._request('HEAD', self._url)
    if not (200 <= response.status <= 299):
      return -1
    length = response.headers.get('content-length', 0)
    return int(length)

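  # The length is memoized in-process and, when caching is enabled, persisted next to the
  # chunk files so later opens can skip the HEAD request.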
  def get_length(self) -> int:
    if self._length is not None:
      return self._length

    file_length_path = os.path.join(Paths.download_cache_root(), hash_256(self._url) + "_length")
    if not self._force_download and os.path.exists(file_length_path):
      with open(file_length_path) as file_length:
        content = file_length.read()
        self._length = int(content)
        return self._length

    self._length = self.get_length_online()
    if not self._force_download and self._length != -1:
      with atomic_write_in_dir(file_length_path, mode="w") as file_length:
        file_length.write(str(self._length))
    return self._length

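  # Cached reads are assembled from CHUNK_SIZE-aligned chunks: each missing chunk is fetched
  # with read_aux() and written atomically to the cache before the requested slice is returned.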
  def read(self, ll: int|None=None) -> bytes:
    if self._force_download:
      return self.read_aux(ll=ll)

    file_begin = self._pos
    file_end = self._pos + ll if ll is not None else self.get_length()
    assert file_end != -1, f"Remote file is empty or doesn't exist: {self._url}"
    # We have to align with the chunks we store. Position is the beginning of the last chunk that starts at or before file_begin.
    position = (file_begin // CHUNK_SIZE) * CHUNK_SIZE
    response = b""
    while True:
      self._pos = position
      chunk_number = self._pos / CHUNK_SIZE
      file_name = hash_256(self._url) + "_" + str(chunk_number)
      full_path = os.path.join(Paths.download_cache_root(), str(file_name))
      data = None
      # If we don't have the chunk cached, download it
      if not os.path.exists(full_path):
        data = self.read_aux(ll=CHUNK_SIZE)
        with atomic_write_in_dir(full_path, mode="wb") as new_cached_file:
          new_cached_file.write(data)
      else:
        with open(full_path, "rb") as cached_file:
          data = cached_file.read()

      response += data[max(0, file_begin - position): min(CHUNK_SIZE, file_end - position)]

      position += CHUNK_SIZE
      if position >= file_end:
        self._pos = file_end
        return response

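  # Uncached read: issue the HTTP GET directly, using a Range header whenever reading from a
  # non-zero offset or with an explicit length, and validate the response code accordingly.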
  def read_aux(self, ll: int|None=None) -> bytes:
    download_range = False
    headers = {}
    if self._pos != 0 or ll is not None:
      if ll is None:
        end = self.get_length() - 1
      else:
        end = min(self._pos + ll, self.get_length()) - 1
      if self._pos >= end:
        return b""
      headers['Range'] = f"bytes={self._pos}-{end}"
      download_range = True

    if self._debug:
      t1 = time.time()

    response = self._request('GET', self._url, headers=headers)
    ret = response.data

    if self._debug:
      t2 = time.time()
      if t2 - t1 > 0.1:
        print(f"get {self._url} {headers!r} {t2 - t1:.3f} slow")

    response_code = response.status
    if response_code == 416:  # Requested Range Not Satisfiable
      raise URLFileException(f"Error, range out of bounds {response_code} {headers} ({self._url}): {repr(ret)[:500]}")
    if download_range and response_code != 206:  # Partial Content
      raise URLFileException(f"Error, requested range but got unexpected response {response_code} {headers} ({self._url}): {repr(ret)[:500]}")
    if (not download_range) and response_code != 200:  # OK
      raise URLFileException(f"Error {response_code} {headers} ({self._url}): {repr(ret)[:500]}")

    self._pos += len(ret)
    return ret

  def seek(self, pos: int) -> None:
    self._pos = pos

  @property
  def name(self) -> str:
    return self._url

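
# Reset the shared PoolManager in forked children so they open their own connections.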
os.register_at_fork(after_in_child=URLFile.reset)
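
# Example usage (illustrative; caching requires FILEREADER_CACHE=1 or cache=True):
#   with URLFile("https://example.com/file.bin", cache=True) as f:
#     size = f.get_length()
#     f.seek(size // 2)
#     second_half = f.read()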