Spaces:

xiaoming32236046
/

yolov10_CTC

Sleeping

App Files Files Community

yolov10_CTC / ultralytics /hub /session.py

xiaoming32236046

Upload 325 files

53ad959 verified 4 months ago

raw

history blame

14.6 kB

	# Ultralytics YOLO 🚀, AGPL-3.0 license

	import threading
	import time
	from http import HTTPStatus
	from pathlib import Path

	import requests

	from ultralytics.hub.utils import HUB_WEB_ROOT, HELP_MSG, PREFIX, TQDM
	from ultralytics.utils import LOGGER, SETTINGS, __version__, checks, emojis, is_colab
	from ultralytics.utils.errors import HUBModelError

	AGENT_NAME = f"python-{__version__}-colab" if is_colab() else f"python-{__version__}-local"


	class HUBTrainingSession:
	"""
	HUB training session for Ultralytics HUB YOLO models. Handles model initialization, heartbeats, and checkpointing.

	Attributes:
	agent_id (str): Identifier for the instance communicating with the server.
	model_id (str): Identifier for the YOLO model being trained.
	model_url (str): URL for the model in Ultralytics HUB.
	api_url (str): API URL for the model in Ultralytics HUB.
	auth_header (dict): Authentication header for the Ultralytics HUB API requests.
	rate_limits (dict): Rate limits for different API calls (in seconds).
	timers (dict): Timers for rate limiting.
	metrics_queue (dict): Queue for the model's metrics.
	model (dict): Model data fetched from Ultralytics HUB.
	alive (bool): Indicates if the heartbeat loop is active.
	"""

	def __init__(self, identifier):
	"""
	Initialize the HUBTrainingSession with the provided model identifier.

	Args:
	identifier (str): Model identifier used to initialize the HUB training session.
	It can be a URL string or a model key with specific format.

	Raises:
	ValueError: If the provided model identifier is invalid.
	ConnectionError: If connecting with global API key is not supported.
	ModuleNotFoundError: If hub-sdk package is not installed.
	"""
	from hub_sdk import HUBClient

	self.rate_limits = {
	"metrics": 3.0,
	"ckpt": 900.0,
	"heartbeat": 300.0,
	} # rate limits (seconds)
	self.metrics_queue = {} # holds metrics for each epoch until upload
	self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed
	self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py

	# Parse input
	api_key, model_id, self.filename = self._parse_identifier(identifier)

	# Get credentials
	active_key = api_key or SETTINGS.get("api_key")
	credentials = {"api_key": active_key} if active_key else None # set credentials

	# Initialize client
	self.client = HUBClient(credentials)

	if model_id:
	self.load_model(model_id) # load existing model
	else:
	self.model = self.client.model() # load empty model

	def load_model(self, model_id):
	"""Loads an existing model from Ultralytics HUB using the provided model identifier."""
	self.model = self.client.model(model_id)
	if not self.model.data: # then model does not exist
	raise ValueError(emojis("❌ The specified HUB model does not exist")) # TODO: improve error handling

	self.model_url = f"{HUB_WEB_ROOT}/models/{self.model.id}"

	self._set_train_args()

	# Start heartbeats for HUB to monitor agent
	self.model.start_heartbeat(self.rate_limits["heartbeat"])
	LOGGER.info(f"{PREFIX}View model at {self.model_url} 🚀")

	def create_model(self, model_args):
	"""Initializes a HUB training session with the specified model identifier."""
	payload = {
	"config": {
	"batchSize": model_args.get("batch", -1),
	"epochs": model_args.get("epochs", 300),
	"imageSize": model_args.get("imgsz", 640),
	"patience": model_args.get("patience", 100),
	"device": model_args.get("device", ""),
	"cache": model_args.get("cache", "ram"),
	},
	"dataset": {"name": model_args.get("data")},
	"lineage": {
	"architecture": {
	"name": self.filename.replace(".pt", "").replace(".yaml", ""),
	},
	"parent": {},
	},
	"meta": {"name": self.filename},
	}

	if self.filename.endswith(".pt"):
	payload["lineage"]["parent"]["name"] = self.filename

	self.model.create_model(payload)

	# Model could not be created
	# TODO: improve error handling
	if not self.model.id:
	return

	self.model_url = f"{HUB_WEB_ROOT}/models/{self.model.id}"

	# Start heartbeats for HUB to monitor agent
	self.model.start_heartbeat(self.rate_limits["heartbeat"])

	LOGGER.info(f"{PREFIX}View model at {self.model_url} 🚀")

	def _parse_identifier(self, identifier):
	"""
	Parses the given identifier to determine the type of identifier and extract relevant components.

	The method supports different identifier formats:
	- A HUB URL, which starts with HUB_WEB_ROOT followed by '/models/'
	- An identifier containing an API key and a model ID separated by an underscore
	- An identifier that is solely a model ID of a fixed length
	- A local filename that ends with '.pt' or '.yaml'

	Args:
	identifier (str): The identifier string to be parsed.

	Returns:
	(tuple): A tuple containing the API key, model ID, and filename as applicable.

	Raises:
	HUBModelError: If the identifier format is not recognized.
	"""

	# Initialize variables
	api_key, model_id, filename = None, None, None

	# Check if identifier is a HUB URL
	if identifier.startswith(f"{HUB_WEB_ROOT}/models/"):
	# Extract the model_id after the HUB_WEB_ROOT URL
	model_id = identifier.split(f"{HUB_WEB_ROOT}/models/")[-1]
	else:
	# Split the identifier based on underscores only if it's not a HUB URL
	parts = identifier.split("_")

	# Check if identifier is in the format of API key and model ID
	if len(parts) == 2 and len(parts[0]) == 42 and len(parts[1]) == 20:
	api_key, model_id = parts
	# Check if identifier is a single model ID
	elif len(parts) == 1 and len(parts[0]) == 20:
	model_id = parts[0]
	# Check if identifier is a local filename
	elif identifier.endswith(".pt") or identifier.endswith(".yaml"):
	filename = identifier
	else:
	raise HUBModelError(
	f"model='{identifier}' could not be parsed. Check format is correct. "
	f"Supported formats are Ultralytics HUB URL, apiKey_modelId, modelId, local pt or yaml file."
	)

	return api_key, model_id, filename

	def _set_train_args(self):
	"""
	Initializes training arguments and creates a model entry on the Ultralytics HUB.

	This method sets up training arguments based on the model's state and updates them with any additional
	arguments provided. It handles different states of the model, such as whether it's resumable, pretrained,
	or requires specific file setup.

	Raises:
	ValueError: If the model is already trained, if required dataset information is missing, or if there are
	issues with the provided training arguments.
	"""
	if self.model.is_trained():
	raise ValueError(emojis(f"Model is already trained and uploaded to {self.model_url} 🚀"))

	if self.model.is_resumable():
	# Model has saved weights
	self.train_args = {"data": self.model.get_dataset_url(), "resume": True}
	self.model_file = self.model.get_weights_url("last")
	else:
	# Model has no saved weights
	self.train_args = self.model.data.get("train_args") # new response

	# Set the model file as either a .pt or .yaml file
	self.model_file = (
	self.model.get_weights_url("parent") if self.model.is_pretrained() else self.model.get_architecture()
	)

	if "data" not in self.train_args:
	# RF bug - datasets are sometimes not exported
	raise ValueError("Dataset may still be processing. Please wait a minute and try again.")

	self.model_file = checks.check_yolov5u_filename(self.model_file, verbose=False) # YOLOv5->YOLOv5u
	self.model_id = self.model.id

	def request_queue(
	self,
	request_func,
	retry=3,
	timeout=30,
	thread=True,
	verbose=True,
	progress_total=None,
	*args,
	**kwargs,
	):
	def retry_request():
	"""Attempts to call `request_func` with retries, timeout, and optional threading."""
	t0 = time.time() # Record the start time for the timeout
	for i in range(retry + 1):
	if (time.time() - t0) > timeout:
	LOGGER.warning(f"{PREFIX}Timeout for request reached. {HELP_MSG}")
	break # Timeout reached, exit loop

	response = request_func(args, *kwargs)
	if response is None:
	LOGGER.warning(f"{PREFIX}Received no response from the request. {HELP_MSG}")
	time.sleep(2**i) # Exponential backoff before retrying
	continue # Skip further processing and retry

	if progress_total:
	self._show_upload_progress(progress_total, response)

	if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
	# if request related to metrics upload
	if kwargs.get("metrics"):
	self.metrics_upload_failed_queue = {}
	return response # Success, no need to retry

	if i == 0:
	# Initial attempt, check status code and provide messages
	message = self._get_failure_message(response, retry, timeout)

	if verbose:
	LOGGER.warning(f"{PREFIX}{message} {HELP_MSG} ({response.status_code})")

	if not self._should_retry(response.status_code):
	LOGGER.warning(f"{PREFIX}Request failed. {HELP_MSG} ({response.status_code}")
	break # Not an error that should be retried, exit loop

	time.sleep(2**i) # Exponential backoff for retries

	# if request related to metrics upload and exceed retries
	if response is None and kwargs.get("metrics"):
	self.metrics_upload_failed_queue.update(kwargs.get("metrics", None))

	return response

	if thread:
	# Start a new thread to run the retry_request function
	threading.Thread(target=retry_request, daemon=True).start()
	else:
	# If running in the main thread, call retry_request directly
	return retry_request()

	def _should_retry(self, status_code):
	"""Determines if a request should be retried based on the HTTP status code."""
	retry_codes = {
	HTTPStatus.REQUEST_TIMEOUT,
	HTTPStatus.BAD_GATEWAY,
	HTTPStatus.GATEWAY_TIMEOUT,
	}
	return status_code in retry_codes

	def _get_failure_message(self, response: requests.Response, retry: int, timeout: int):
	"""
	Generate a retry message based on the response status code.

	Args:
	response: The HTTP response object.
	retry: The number of retry attempts allowed.
	timeout: The maximum timeout duration.

	Returns:
	(str): The retry message.
	"""
	if self._should_retry(response.status_code):
	return f"Retrying {retry}x for {timeout}s." if retry else ""
	elif response.status_code == HTTPStatus.TOO_MANY_REQUESTS: # rate limit
	headers = response.headers
	return (
	f"Rate limit reached ({headers['X-RateLimit-Remaining']}/{headers['X-RateLimit-Limit']}). "
	f"Please retry after {headers['Retry-After']}s."
	)
	else:
	try:
	return response.json().get("message", "No JSON message.")
	except AttributeError:
	return "Unable to read JSON."

	def upload_metrics(self):
	"""Upload model metrics to Ultralytics HUB."""
	return self.request_queue(self.model.upload_metrics, metrics=self.metrics_queue.copy(), thread=True)

	def upload_model(
	self,
	epoch: int,
	weights: str,
	is_best: bool = False,
	map: float = 0.0,
	final: bool = False,
	) -> None:
	"""
	Upload a model checkpoint to Ultralytics HUB.

	Args:
	epoch (int): The current training epoch.
	weights (str): Path to the model weights file.
	is_best (bool): Indicates if the current model is the best one so far.
	map (float): Mean average precision of the model.
	final (bool): Indicates if the model is the final model after training.
	"""
	if Path(weights).is_file():
	progress_total = Path(weights).stat().st_size if final else None # Only show progress if final
	self.request_queue(
	self.model.upload_model,
	epoch=epoch,
	weights=weights,
	is_best=is_best,
	map=map,
	final=final,
	retry=10,
	timeout=3600,
	thread=not final,
	progress_total=progress_total,
	)
	else:
	LOGGER.warning(f"{PREFIX}WARNING ⚠️ Model upload issue. Missing model {weights}.")

	def _show_upload_progress(self, content_length: int, response: requests.Response) -> None:
	"""
	Display a progress bar to track the upload progress of a file download.

	Args:
	content_length (int): The total size of the content to be downloaded in bytes.
	response (requests.Response): The response object from the file download request.

	Returns:
	None
	"""
	with TQDM(total=content_length, unit="B", unit_scale=True, unit_divisor=1024) as pbar:
	for data in response.iter_content(chunk_size=1024):
	pbar.update(len(data))