# NOTE: the "Spaces: Sleeping" lines originally here were residue from the
# HuggingFace Spaces status page scrape, not part of the program.
import difflib
import json
import os

import requests
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Base URL of the Firebase Realtime Database REST endpoint (used as a cache).
# NOTE(review): if the env var is unset this is None and every request URL
# becomes "None/..." — confirm FIREBASE_URL is always configured at deploy time.
FIREBASE_URL = os.getenv("FIREBASE_URL")
def fetch_from_firebase(model_id, data_type):
    """Fetch a cached entry for *model_id* from Firebase.

    Args:
        model_id: HuggingFace model identifier used as the cache key.
        data_type: Firebase collection name (e.g. "model_structures").

    Returns:
        The decoded JSON payload, or None when the entry is missing or the
        request fails (callers treat None as a cache miss and recompute).
    """
    try:
        # Timeout prevents the Streamlit app from hanging forever on a
        # slow/unreachable Firebase endpoint (original call had none).
        response = requests.get(
            f"{FIREBASE_URL}/{data_type}/{model_id}.json", timeout=10
        )
    except requests.RequestException:
        # A network failure is just a cache miss, not a fatal error.
        return None
    if response.status_code == 200:
        return response.json()
    return None
def save_to_firebase(model_id, data, data_type):
    """Persist *data* under ``{data_type}/{model_id}`` in Firebase.

    Args:
        model_id: HuggingFace model identifier used as the cache key.
        data: JSON-serializable payload to store.
        data_type: Firebase collection name.

    Returns:
        True when Firebase acknowledged the write (HTTP 200), False otherwise.
    """
    try:
        # Timeout added: the cache write is best-effort and must not hang the UI.
        response = requests.put(
            f"{FIREBASE_URL}/{data_type}/{model_id}.json",
            data=json.dumps(data),
            timeout=10,
        )
    except requests.RequestException:
        # Failing to cache is non-fatal; report failure instead of raising.
        return False
    return response.status_code == 200
def get_model_structure(model_id) -> list[str]:
    """Return one ``"param_name: shape"`` line per tensor in the model.

    Checks the Firebase cache first; on a miss, downloads the model, builds
    the listing, caches it, and returns it.

    Args:
        model_id: HuggingFace model identifier.

    Returns:
        List of strings like ``"model.embed_tokens.weight: torch.Size([...])"``.
    """
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    if struct_lines:
        return struct_lines
    # Cache miss: load weights on CPU in bf16 to keep memory use down.
    # SECURITY NOTE: trust_remote_code=True executes code shipped with the
    # model repo — only acceptable because users explicitly pick the model.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    # f-strings call str() on the shape, so a single comprehension replaces
    # the original two-pass dict + list construction.
    struct_lines = [f"{k}: {v.shape}" for k, v in model.state_dict().items()]
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines
def get_tokenizer_vocab_size(model_id) -> int:
    """Return the tokenizer vocab size for *model_id*, with Firebase caching.

    Args:
        model_id: HuggingFace model identifier.

    Returns:
        The tokenizer's vocabulary size.
    """
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    # BUG FIX: the original used `if vocab_size:` — a cached value of 0 is
    # falsy and would force a re-download; only None means "not cached".
    if vocab_size is not None:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size
def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    """Produce an ndiff-style line-by-line comparison of two structure listings.

    Args:
        struct1_lines: "name: shape" lines of the first model.
        struct2_lines: "name: shape" lines of the second model.

    Returns:
        An iterator of diff lines prefixed with "- ", "+ ", "  ", or "? ".
    """
    return difflib.ndiff(struct1_lines, struct2_lines)
def display_diff(diff):
    """Render an ndiff sequence as two parallel HTML columns.

    Removed lines are highlighted red in the left column, added lines green in
    the right column, and unchanged lines appear in both; each column pads the
    other side with an empty row so rows stay aligned.

    Args:
        diff: iterable of ndiff-style lines ("- ", "+ ", "  ", "? " prefixes).

    Returns:
        Tuple ``(left_html, right_html, diff_found)`` where the HTML strings
        are "<br>"-joined rows and ``diff_found`` is True when any line
        was added or removed.
    """
    left_rows, right_rows = [], []
    changed = False
    for entry in diff:
        marker, text = entry[:2], entry[2:]
        if marker == "- ":
            left_rows.append(
                f'<span style="background-color: #ffdddd;">{text}</span>'
            )
            right_rows.append("")
            changed = True
        elif marker == "+ ":
            right_rows.append(
                f'<span style="background-color: #ddffdd;">{text}</span>'
            )
            left_rows.append("")
            changed = True
        elif marker == "  ":
            left_rows.append(text)
            right_rows.append(text)
        # "? " intraline hint lines are deliberately dropped.
    return "<br>".join(left_rows), "<br>".join(right_rows), changed
# ---------------------------------------------------------------------------
# Streamlit UI. This is a top-level script: Streamlit re-executes it from the
# top on every user interaction, so statement order below is the behavior.
# ---------------------------------------------------------------------------

# Set Streamlit page configuration to wide mode
st.set_page_config(layout="wide")
# Apply custom CSS for wider layout
st.markdown(
    """
    <style>
    .reportview-container .main .block-container {
        max-width: 100%;
        padding-left: 10%;
        padding-right: 10%;
    }
    .stMarkdown {
        white-space: pre-wrap;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
st.title("Model Structure Comparison Tool")
model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")
if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        if model_id1 and model_id2:
            # Get model structures (Firebase-cached; a cache miss downloads
            # the full model on CPU, which can take a while).
            struct1 = get_model_structure(model_id1)
            struct2 = get_model_structure(model_id2)
            # Compare model structures
            diff = compare_structures(struct1, struct2)
            left_html, right_html, diff_found = display_diff(diff)
            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")
            col1, col2 = st.columns(
                [1.5, 1.5]
            )  # Adjust the ratio to make columns wider
            with col1:
                st.write(f"### Model 1: {model_id1}")
                # unsafe_allow_html is required for the <span> highlight markup.
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)
            # Tokenizer verification
            try:
                vocab_size1 = get_tokenizer_vocab_size(model_id1)
                vocab_size2 = get_tokenizer_vocab_size(model_id2)
                if vocab_size1 == vocab_size2:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")
                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
            except Exception as e:
                # Broad catch keeps the UI alive when a tokenizer fails to
                # load (bad model id, gated repo, network error, ...).
                st.error(f"Error loading tokenizers: {e}")
        else:
            st.error("Please enter both model IDs.")