https://huggingface.co:443 "HEAD /lmms-lab/llavanext-qwen-tokenizer/resolve/main/processor_config.json HTTP/1.1" 404 0
Hi, it looks like a 404 error, and the file in that repo is named preprocessor_config.json, not processor_config.json. Here are the logs:
/usr/local/lib/python3.11/site-packages/transformers/models/llava/configuration_llava.py:143: FutureWarning: The vocab_size attribute is deprecated and will be removed in v4.42, Please use text_config.vocab_size instead.
  warnings.warn(
(the warning above appears four times in the log)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
https://huggingface.co:443 "HEAD /lmms-lab/llavanext-qwen-tokenizer/resolve/main/processor_config.json HTTP/1.1" 404 0
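For what it's worth, you can confirm the file naming directly by listing the repo contents with huggingface_hub; this is just a quick sketch (assuming huggingface_hub is installed locally), showing that the HEAD 404 on processor_config.json is Transformers probing for an optional file the repo does not ship:

from huggingface_hub import list_repo_files

# List every file in the tokenizer repo and check which config names exist.
files = list_repo_files("lmms-lab/llavanext-qwen-tokenizer")
print("preprocessor_config.json" in files)  # present in the repo
print("processor_config.json" in files)     # absent, which explains the HEAD 404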
This tokenizer repo is used for LLaVA-Qwen on SGLang; I don't think you can load it using the Llava model classes in HF.
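To illustrate the split (a minimal sketch, assuming the repo ships only tokenizer/preprocessor files): the tokenizer can be loaded on its own with AutoTokenizer, but there is no Llava checkpoint behind it, so it is meant to be passed to SGLang as tokenizer_path rather than loaded through the HF Llava classes.

from transformers import AutoTokenizer

# Loading just the tokenizer from the repo should work on its own;
# treating the repo as a full HF Llava model will not.
tokenizer = AutoTokenizer.from_pretrained("lmms-lab/llavanext-qwen-tokenizer")
print(type(tokenizer).__name__)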
@KC I'm loading it like this:
runtime = sgl.Runtime(
    model_path=MODEL_PATH,
    tokenizer_path=TOKENIZER_PATH,
    tp_size=GPU_CONFIG.count,
    log_level="debug",
    enable_flashinfer=True,
)
sgl.set_default_backend(runtime)
Cannot reproduce using srt_example_llava:
"""
Usage: python3 srt_example_llava.py
"""
import sglang as sgl
@sgl
.function
def image_qa(s, image_path, question):
s += sgl.user(sgl.image(image_path) + question)
s += sgl.assistant(sgl.gen("answer"))
def single():
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=128)
print(state["answer"], "\n")
def stream():
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64,
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = image_qa.run_batch(
[
{"image_path": "images/cat.jpeg", "question":"What is this?"},
{"image_path": "images/dog.jpeg", "question":"What is this?"},
],
max_new_tokens=128,
)
for s in states:
print(s["answer"], "\n")
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="lmms-lab/llava-next-72b",
tokenizer_path="lmms-lab/llavanext-qwen-tokenizer", tp_size=8)
sgl.set_default_backend(runtime)
print(f"chat template: {runtime.endpoint.chat_template.name}")
# Or you can use API models
# sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview"))
# sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()
If it is an internet-related error, you should check your network settings.
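If you want to rule the network in or out, a minimal connectivity check (a sketch, assuming huggingface_hub is available inside the container) is to fetch the repo metadata and see whether the round trip to huggingface.co succeeds at all:

from huggingface_hub import HfApi

# A successful call means the container can reach huggingface.co;
# a DNS/proxy problem would raise an exception here instead.
info = HfApi().model_info("lmms-lab/llavanext-qwen-tokenizer")
print([f.rfilename for f in info.siblings])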
@kcz358 Can you please sign up for Modal and run this script? They will give you $30 free credits!
import modal

GPU_CONFIG = modal.gpu.A100(size="80GB", count=4)
MINUTES = 60

MODEL_PATH = "lmms-lab/llava-next-72b"
TOKENIZER_PATH = "lmms-lab/llavanext-qwen-tokenizer"


def download_model():
    import sglang as sgl

    sgl.Runtime(
        model_path=MODEL_PATH,
        tokenizer_path=TOKENIZER_PATH,
        tp_size=GPU_CONFIG.count,
        log_level="debug",
    )


vllm_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.2.0-devel-ubuntu22.04", add_python="3.11")
    .apt_install("git", "wget", "cmake")
    .pip_install(
        "wheel==0.43.0",
        "torch==2.3.0",
        "torchvision==0.18.0",
        "transformers==4.40.2",
        "vllm==0.4.2",
        "timm==0.9.12",
        "Pillow==10.3.0",
        "hf-transfer==0.1.6",
        "huggingface_hub==0.22.2",
        "requests==2.31.0",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_commands("pip install flash-attn --no-build-isolation")
    .run_commands("pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git")
    .run_commands(
        "git clone https://github.com/sgl-project/sglang.git && cd sglang && pip install -e 'python[all]'"
    )
    # .run_function(download_model, gpu=GPU_CONFIG)
    .run_commands("pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/")
)

app = modal.App("app")


@app.cls(
    gpu=GPU_CONFIG,
    timeout=20 * MINUTES,
    container_idle_timeout=20 * MINUTES,
    allow_concurrent_inputs=10,
    image=vllm_image,
)
class Model:
    @modal.enter()
    async def start_engine(self):
        import sglang as sgl
        import subprocess

        subprocess.run(["nvidia-smi", "-L"])
        runtime = sgl.Runtime(
            model_path=MODEL_PATH,
            tokenizer_path=TOKENIZER_PATH,
            tp_size=GPU_CONFIG.count,
            log_level="debug",
            enable_flashinfer=True,
        )
        sgl.set_default_backend(runtime)

    @modal.method()
    async def generate(self):
        print("Generating...")
        import sglang as sgl
        import requests

        response = requests.get(
            "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        )
        with open("./nyc.png", "wb") as file:
            file.write(response.content)

        @sgl.function
        def image_qa(s, image_path, question):
            s += sgl.user(sgl.image(image_path) + question)
            s += sgl.assistant(sgl.gen("answer"))

        state = image_qa.run(
            image_path="./nyc.png", question="What is this?", max_new_tokens=128
        )
        print(state["answer"], "\n")


@app.local_entrypoint()
def main():
    Model().generate.remote()
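For what it's worth, the usual way to exercise a script like this is modal run <your_file>.py from the local machine (the filename is whatever you saved it as); that invokes main(), which in turn calls Model().generate.remote() inside the A100 container, so it should reproduce the tokenizer/404 behavior end to end if it is environment-specific.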