import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import difflib
import requests
import os
import json
# Base URL of the Firebase Realtime Database used as a cross-run cache
# (None when the FIREBASE_URL environment variable is unset).
FIREBASE_URL = os.environ.get("FIREBASE_URL")
def fetch_from_firebase(model_id, data_type):
    """Fetch cached data for *model_id* from the Firebase REST endpoint.

    Parameters
    ----------
    model_id : str
        HuggingFace model identifier used as the cache key.
    data_type : str
        Top-level Firebase collection name (e.g. "model_structures").

    Returns
    -------
    The decoded JSON payload on HTTP 200, otherwise None.  Note that
    Firebase also answers 200 with a JSON ``null`` for missing keys, so
    callers must treat None as a cache miss either way.
    """
    # timeout= prevents the Streamlit worker from hanging forever on a
    # slow or unreachable endpoint (requests has no default timeout).
    response = requests.get(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json", timeout=10
    )
    if response.status_code == 200:
        return response.json()
    return None
def save_to_firebase(model_id, data, data_type):
    """Store *data* for *model_id* under the *data_type* collection.

    Parameters
    ----------
    model_id : str
        HuggingFace model identifier used as the cache key.
    data : Any JSON-serializable value.
    data_type : str
        Top-level Firebase collection name.

    Returns
    -------
    bool
        True when Firebase acknowledged the write (HTTP 200).
    """
    response = requests.put(
        f"{FIREBASE_URL}/{data_type}/{model_id}.json",
        data=json.dumps(data),
        timeout=10,  # avoid hanging the UI on an unreachable endpoint
    )
    return response.status_code == 200
def get_model_structure(model_id) -> list[str]:
    """Return "name: shape" lines describing *model_id*'s state dict.

    Results are cached in Firebase under "model_structures"; on a cache
    miss the model weights are loaded on CPU (bfloat16) just to read the
    parameter shapes, and the listing is written back to the cache.
    """
    struct_lines = fetch_from_firebase(model_id, "model_structures")
    # `is not None`: an empty cached listing is still a cache hit; the
    # previous truthiness test would have re-downloaded the model for it.
    if struct_lines is not None:
        return struct_lines
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
    )
    # f"{v.shape}" == str(v.shape); one comprehension replaces the
    # intermediate dict the original built only to re-iterate it.
    struct_lines = [f"{k}: {v.shape}" for k, v in model.state_dict().items()]
    del model  # release the (potentially large) weights promptly
    save_to_firebase(model_id, struct_lines, "model_structures")
    return struct_lines
def get_tokenizer_vocab_size(model_id) -> int:
    """Return the tokenizer vocab size for *model_id*, cached in Firebase.

    On a cache miss the tokenizer is loaded and its ``vocab_size`` is
    written back to the "tokenizer_vocab_sizes" collection.
    """
    vocab_size = fetch_from_firebase(model_id, "tokenizer_vocab_sizes")
    # `is not None`: 0 is falsy, so a truthiness test would wrongly treat
    # a cached value of 0 as a cache miss.
    if vocab_size is not None:
        return vocab_size
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    vocab_size = tokenizer.vocab_size
    save_to_firebase(model_id, vocab_size, "tokenizer_vocab_sizes")
    return vocab_size
def compare_structures(struct1_lines: list[str], struct2_lines: list[str]):
    """Compare two structure listings line by line.

    Returns the lazy generator produced by ``difflib.ndiff``; each yielded
    line is prefixed with "- ", "+ ", "  " or "? " as documented there.
    """
    return difflib.ndiff(struct1_lines, struct2_lines)
def display_diff(diff):
    """Render an ndiff stream as two aligned HTML columns.

    Parameters
    ----------
    diff : iterable of str
        Lines produced by ``difflib.ndiff``: "- " removed, "+ " added,
        "  " common, "? " intra-line hints (skipped here).

    Returns
    -------
    tuple[str, str, bool]
        ``(left_html, right_html, diff_found)`` — column bodies with rows
        joined by ``<br>`` (so they render as line breaks inside
        ``st.markdown(..., unsafe_allow_html=True)``) and a flag telling
        whether any difference was encountered.
    """
    left_lines = []
    right_lines = []
    diff_found = False
    for line in diff:
        if line.startswith("- "):
            # Present only on the left; pad the right to keep rows aligned.
            left_lines.append(line[2:])
            right_lines.append("")
            diff_found = True
        elif line.startswith("+ "):
            right_lines.append(line[2:])
            left_lines.append("")
            diff_found = True
        elif line.startswith("  "):
            left_lines.append(line[2:])
            right_lines.append(line[2:])
        # "? " hint lines from ndiff are intentionally ignored.
    # NOTE(review): the original had a literal newline inside the join
    # string — a syntax error, most likely stripped HTML.  <br> is used so
    # the row breaks survive the HTML rendering path used by the callers.
    left_html = "<br>".join(left_lines)
    right_html = "<br>".join(right_lines)
    return left_html, right_html, diff_found
# Set Streamlit page configuration to wide mode so the two diff columns
# can use the full browser width.
st.set_page_config(layout="wide")
# Apply custom CSS for wider layout.
# NOTE(review): the triple-quoted block below is empty — the stylesheet
# content appears to have been stripped from this copy of the file;
# confirm against version control before relying on this styling.
st.markdown(
"""
""",
unsafe_allow_html=True,
)
st.title("Model Structure Comparison Tool")

model_id1 = st.text_input("Enter the first HuggingFace Model ID")
model_id2 = st.text_input("Enter the second HuggingFace Model ID")

if st.button("Compare Models"):
    with st.spinner("Comparing models and loading tokenizers..."):
        if model_id1 and model_id2:
            # Get model structures.  Wrapped in try/except for consistency
            # with the tokenizer section below — a bad model ID previously
            # crashed the app here with a raw traceback.
            try:
                struct1 = get_model_structure(model_id1)
                struct2 = get_model_structure(model_id2)
            except Exception as e:
                st.error(f"Error loading model structures: {e}")
                st.stop()

            # Compare model structures and render side by side.
            diff = compare_structures(struct1, struct2)
            left_html, right_html, diff_found = display_diff(diff)

            st.write("### Comparison Result")
            if not diff_found:
                st.success("The model structures are identical.")

            col1, col2 = st.columns(
                [1.5, 1.5]
            )  # Adjust the ratio to make columns wider
            with col1:
                st.write(f"### Model 1: {model_id1}")
                st.markdown(left_html, unsafe_allow_html=True)
            with col2:
                st.write(f"### Model 2: {model_id2}")
                st.markdown(right_html, unsafe_allow_html=True)

            # Tokenizer verification (mirrors the error handling above).
            try:
                vocab_size1 = get_tokenizer_vocab_size(model_id1)
                vocab_size2 = get_tokenizer_vocab_size(model_id2)
            except Exception as e:
                st.error(f"Error loading tokenizers: {e}")
            else:
                if vocab_size1 == vocab_size2:
                    st.success("The tokenizer vocab sizes are identical.")
                else:
                    st.warning("The tokenizer vocab sizes are different.")
                st.write(f"**{model_id1} Tokenizer Vocab Size**: {vocab_size1}")
                st.write(f"**{model_id2} Tokenizer Vocab Size**: {vocab_size2}")
        else:
            st.error("Please enter both model IDs.")