|
from typing import List, Generator |
|
|
|
from bs4 import BeautifulSoup, Tag |
|
import urllib3 |
|
|
|
# Base URL of the Hugging Face model listing, filtered to text-generation models
# for the PyTorch library. get_soups() appends "&p=<page index>" to it to
# address an individual listing page.
SUPPORTED_MODEL_NAME_PAGES_FORMAT: str = (
    "https://huggingface.co/models"
    "?pipeline_tag=text-generation"
    "&library=pytorch"
)
|
|
|
|
|
def get_model_name(model_card: Tag) -> str:
    """
    Gets the model name from the model card.

    The name is read from the ``<h4>`` heading inside the card. The heading is
    looked up by the full value of its ``class`` attribute, so the lookup only
    succeeds when the page markup carries exactly that class string.

    :param model_card: The model card to get the model name from.
    :return: The text of the heading, returned unmodified.
    :raises ValueError: If the card has no ``<h4>`` with the expected class.
    """
    h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
    h4_tag = model_card.find("h4", class_=h4_class)
    if h4_tag is None:
        # find() yields None when nothing matches; fail with an explicit
        # message instead of an opaque AttributeError on ``None.text``.
        raise ValueError(
            f"model card has no <h4> element with class {h4_class!r}; "
            "the page markup may have changed"
        )
    return h4_tag.text
|
|
|
|
|
def get_soups() -> Generator[BeautifulSoup, None, None]:
    """
    Lazily fetches and parses the model listing pages, one page at a time.

    Pages are requested in order, starting at index 0, by appending
    ``&p=<index>`` to ``SUPPORTED_MODEL_NAME_PAGES_FORMAT``. Iteration stops at
    the first response whose HTTP status is not 200.

    :return: A generator yielding one parsed ``BeautifulSoup`` document per page.
    """
    # A single pool manager serves the whole crawl so connections can be reused
    # between pages; the ``with`` block clears the pools once the generator
    # finishes or is closed by its consumer.
    with urllib3.PoolManager() as http:
        page_index = 0
        while True:
            page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={page_index}"
            response = http.request("GET", page_url)
            if response.status != 200:
                return
            yield BeautifulSoup(response.data, "html.parser")
            page_index += 1
|
|
|
|
|
def get_supported_model_names() -> Generator[str, None, None]:
    """
    Streams the supported model names scraped from the Hugging Face website.

    Walks every page produced by ``get_soups`` and, for each ``<article>``
    element whose class is ``overview-card-wrapper group``, yields the name
    extracted by ``get_model_name``.

    :return: A generator of model names, page by page.
    """
    for page in get_soups():
        cards: List[Tag] = page.find_all("article", class_="overview-card-wrapper group", recursive=True)
        # Delegate to a lazy sub-generator so names are still produced one at a time.
        yield from (get_model_name(card) for card in cards)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print each scraped model name on its own line,
    # as soon as it is produced.
    supported_names = get_supported_model_names()
    for model_name in supported_names:
        print(model_name)
|
|