End of training

3a25a0a verified 3 months ago

12.5 kB

	# coding=utf-8
	# Copyright 2024 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import collections
	import importlib.util
	import os
	import re
	from pathlib import Path


	PATH_TO_TRANSFORMERS = "src/transformers"


	# Matches is_xxx_available()
	_re_backend = re.compile(r"is\_([a-z_]*)_available()")
	# Catches a one-line _import_struct = {xxx}
	_re_one_line_import_struct = re.compile(r"^_import_structure\s+=\s+\{([^\}]+)\}")
	# Catches a line with a key-values pattern: "bla": ["foo", "bar"]
	_re_import_struct_key_value = re.compile(r'\s+"\S":\s+\[([^\]])\]')
	# Catches a line if not is_foo_available
	_re_test_backend = re.compile(r"^\sif\s+not\s+is\_[a-z_]\_available\(\)")
	# Catches a line _import_struct["bla"].append("foo")
	_re_import_struct_add_one = re.compile(r'^\s_import_structure\["\S"\]\.append\("(\S*)"\)')
	# Catches a line _import_struct["bla"].extend(["foo", "bar"]) or _import_struct["bla"] = ["foo", "bar"]
	_re_import_struct_add_many = re.compile(r"^\s_import_structure\[\S\](?:\.extend\(\|\s=\s+)\[([^\]])\]")
	# Catches a line with an object between quotes and a comma: "MyModel",
	_re_quote_object = re.compile(r'^\s+"([^"]+)",')
	# Catches a line with objects between brackets only: ["foo", "bar"],
	_re_between_brackets = re.compile(r"^\s+\[([^\]]+)\]")
	# Catches a line with from foo import bar, bla, boo
	_re_import = re.compile(r"\s+from\s+\S\s+import\s+([^\(\s].)\n")
	# Catches a line with try:
	_re_try = re.compile(r"^\s*try:")
	# Catches a line with else:
	_re_else = re.compile(r"^\s*else:")


	def find_backend(line):
	"""Find one (or multiple) backend in a code line of the init."""
	if _re_test_backend.search(line) is None:
	return None
	backends = [b[0] for b in _re_backend.findall(line)]
	backends.sort()
	return "_and_".join(backends)


	def parse_init(init_file):
	"""
	Read an init_file and parse (per backend) the _import_structure objects defined and the TYPE_CHECKING objects
	defined
	"""
	with open(init_file, "r", encoding="utf-8", newline="\n") as f:
	lines = f.readlines()

	line_index = 0
	while line_index < len(lines) and not lines[line_index].startswith("_import_structure = {"):
	line_index += 1

	# If this is a traditional init, just return.
	if line_index >= len(lines):
	return None

	# First grab the objects without a specific backend in _import_structure
	objects = []
	while not lines[line_index].startswith("if TYPE_CHECKING") and find_backend(lines[line_index]) is None:
	line = lines[line_index]
	# If we have everything on a single line, let's deal with it.
	if _re_one_line_import_struct.search(line):
	content = _re_one_line_import_struct.search(line).groups()[0]
	imports = re.findall(r"\[([^\]]+)\]", content)
	for imp in imports:
	objects.extend([obj[1:-1] for obj in imp.split(", ")])
	line_index += 1
	continue
	single_line_import_search = _re_import_struct_key_value.search(line)
	if single_line_import_search is not None:
	imports = [obj[1:-1] for obj in single_line_import_search.groups()[0].split(", ") if len(obj) > 0]
	objects.extend(imports)
	elif line.startswith(" " * 8 + '"'):
	objects.append(line[9:-3])
	line_index += 1

	import_dict_objects = {"none": objects}
	# Let's continue with backend-specific objects in _import_structure
	while not lines[line_index].startswith("if TYPE_CHECKING"):
	# If the line is an if not is_backend_available, we grab all objects associated.
	backend = find_backend(lines[line_index])
	# Check if the backend declaration is inside a try block:
	if _re_try.search(lines[line_index - 1]) is None:
	backend = None

	if backend is not None:
	line_index += 1

	# Scroll until we hit the else block of try-except-else
	while _re_else.search(lines[line_index]) is None:
	line_index += 1

	line_index += 1

	objects = []
	# Until we unindent, add backend objects to the list
	while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 4):
	line = lines[line_index]
	if _re_import_struct_add_one.search(line) is not None:
	objects.append(_re_import_struct_add_one.search(line).groups()[0])
	elif _re_import_struct_add_many.search(line) is not None:
	imports = _re_import_struct_add_many.search(line).groups()[0].split(", ")
	imports = [obj[1:-1] for obj in imports if len(obj) > 0]
	objects.extend(imports)
	elif _re_between_brackets.search(line) is not None:
	imports = _re_between_brackets.search(line).groups()[0].split(", ")
	imports = [obj[1:-1] for obj in imports if len(obj) > 0]
	objects.extend(imports)
	elif _re_quote_object.search(line) is not None:
	objects.append(_re_quote_object.search(line).groups()[0])
	elif line.startswith(" " * 8 + '"'):
	objects.append(line[9:-3])
	elif line.startswith(" " * 12 + '"'):
	objects.append(line[13:-3])
	line_index += 1

	import_dict_objects[backend] = objects
	else:
	line_index += 1

	# At this stage we are in the TYPE_CHECKING part, first grab the objects without a specific backend
	objects = []
	while (
	line_index < len(lines)
	and find_backend(lines[line_index]) is None
	and not lines[line_index].startswith("else")
	):
	line = lines[line_index]
	single_line_import_search = _re_import.search(line)
	if single_line_import_search is not None:
	objects.extend(single_line_import_search.groups()[0].split(", "))
	elif line.startswith(" " * 8):
	objects.append(line[8:-2])
	line_index += 1

	type_hint_objects = {"none": objects}
	# Let's continue with backend-specific objects
	while line_index < len(lines):
	# If the line is an if is_backend_available, we grab all objects associated.
	backend = find_backend(lines[line_index])
	# Check if the backend declaration is inside a try block:
	if _re_try.search(lines[line_index - 1]) is None:
	backend = None

	if backend is not None:
	line_index += 1

	# Scroll until we hit the else block of try-except-else
	while _re_else.search(lines[line_index]) is None:
	line_index += 1

	line_index += 1

	objects = []
	# Until we unindent, add backend objects to the list
	while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8):
	line = lines[line_index]
	single_line_import_search = _re_import.search(line)
	if single_line_import_search is not None:
	objects.extend(single_line_import_search.groups()[0].split(", "))
	elif line.startswith(" " * 12):
	objects.append(line[12:-2])
	line_index += 1

	type_hint_objects[backend] = objects
	else:
	line_index += 1

	return import_dict_objects, type_hint_objects


	def analyze_results(import_dict_objects, type_hint_objects):
	"""
	Analyze the differences between _import_structure objects and TYPE_CHECKING objects found in an init.
	"""

	def find_duplicates(seq):
	return [k for k, v in collections.Counter(seq).items() if v > 1]

	if list(import_dict_objects.keys()) != list(type_hint_objects.keys()):
	return ["Both sides of the init do not have the same backends!"]

	errors = []
	for key in import_dict_objects.keys():
	duplicate_imports = find_duplicates(import_dict_objects[key])
	if duplicate_imports:
	errors.append(f"Duplicate _import_structure definitions for: {duplicate_imports}")
	duplicate_type_hints = find_duplicates(type_hint_objects[key])
	if duplicate_type_hints:
	errors.append(f"Duplicate TYPE_CHECKING objects for: {duplicate_type_hints}")

	if sorted(set(import_dict_objects[key])) != sorted(set(type_hint_objects[key])):
	name = "base imports" if key == "none" else f"{key} backend"
	errors.append(f"Differences for {name}:")
	for a in type_hint_objects[key]:
	if a not in import_dict_objects[key]:
	errors.append(f" {a} in TYPE_HINT but not in _import_structure.")
	for a in import_dict_objects[key]:
	if a not in type_hint_objects[key]:
	errors.append(f" {a} in _import_structure but not in TYPE_HINT.")
	return errors


	def check_all_inits():
	"""
	Check all inits in the transformers repo and raise an error if at least one does not define the same objects in
	both halves.
	"""
	failures = []
	for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
	if "__init__.py" in files:
	fname = os.path.join(root, "__init__.py")
	objects = parse_init(fname)
	if objects is not None:
	errors = analyze_results(*objects)
	if len(errors) > 0:
	errors[0] = f"Problem in {fname}, both halves do not define the same objects.\n{errors[0]}"
	failures.append("\n".join(errors))
	if len(failures) > 0:
	raise ValueError("\n\n".join(failures))


	def get_transformers_submodules():
	"""
	Returns the list of Transformers submodules.
	"""
	submodules = []
	for path, directories, files in os.walk(PATH_TO_TRANSFORMERS):
	for folder in directories:
	# Ignore private modules
	if folder.startswith("_"):
	directories.remove(folder)
	continue
	# Ignore leftovers from branches (empty folders apart from pycache)
	if len(list((Path(path) / folder).glob("*.py"))) == 0:
	continue
	short_path = str((Path(path) / folder).relative_to(PATH_TO_TRANSFORMERS))
	submodule = short_path.replace(os.path.sep, ".")
	submodules.append(submodule)
	for fname in files:
	if fname == "__init__.py":
	continue
	short_path = str((Path(path) / fname).relative_to(PATH_TO_TRANSFORMERS))
	submodule = short_path.replace(".py", "").replace(os.path.sep, ".")
	if len(submodule.split(".")) == 1:
	submodules.append(submodule)
	return submodules


	IGNORE_SUBMODULES = [
	"convert_pytorch_checkpoint_to_tf2",
	"modeling_flax_pytorch_utils",
	]


	def check_submodules():
	# This is to make sure the transformers module imported is the one in the repo.
	spec = importlib.util.spec_from_file_location(
	"transformers",
	os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"),
	submodule_search_locations=[PATH_TO_TRANSFORMERS],
	)
	transformers = spec.loader.load_module()

	module_not_registered = [
	module
	for module in get_transformers_submodules()
	if module not in IGNORE_SUBMODULES and module not in transformers._import_structure.keys()
	]
	if len(module_not_registered) > 0:
	list_of_modules = "\n".join(f"- {module}" for module in module_not_registered)
	raise ValueError(
	"The following submodules are not properly registered in the main init of Transformers:\n"
	f"{list_of_modules}\n"
	"Make sure they appear somewhere in the keys of `_import_structure` with an empty list as value."
	)


	if __name__ == "__main__":
	check_all_inits()
	check_submodules()