```python from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig import torch if torch.cuda.is_available(): device = "cuda" else: device = "cpu" # Model will be automatically downloaded from HuggingFace model hub if not cached. # Model files will be cached in "~/.cache/huggingface/hub/models--NECOUDBFM--Jellyfish/" by default. # You can also download the model manually and replace the model name with the path to the model files. model = AutoModelForCausalLM.from_pretrained( "NECOUDBFM/Jellyfish", torch_dtype=torch.float16, device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained("NECOUDBFM/Jellyfish") system_message = "You are an AI assistant that follows instruction extremely well. Help as much as you can." # You need to define the user_message variable based on the task and the data you want to test on. user_message = "Hello, world." prompt = f"{system_message}\n\n### Instruction:\n\n{user_message}\n\n### Response:\n\n" inputs = tokenizer(prompt, return_tensors="pt") input_ids = inputs["input_ids"].to(device) # You can modify the sampling parameters according to your needs. generation_config = GenerationConfig( do_samples=True, temperature=0.35, top_p=0.9, ) with torch.no_grad(): generation_output = model.generate( input_ids=input_ids, generation_config=generation_config, return_dict_in_generate=True, output_scores=True, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id, repetition_penalty=1.15, ) output = generation_output[0] response = tokenizer.decode( output[:, input_ids.shape[-1] :][0], skip_special_tokens=True ).strip() print(response) ```

```python from vllm import LLM, SamplingParams # To use vllm for inference, you need to download the model files either using HuggingFace model hub or manually. # You should modify the path to the model according to your local environment. path_to_model = ( "/workspace/models/Jellyfish" ) model = LLM(model=path_to_model) # You can modify the sampling parameters according to your needs. # Caution: The stop parameter should not be changed. sampling_params = SamplingParams( temperature=0.35, top_p=0.9, max_tokens=1024, stop=["### Instruction:"], ) system_message = "You are an AI assistant that follows instruction extremely well. Help as much as you can." # You need to define the user_message variable based on the task and the data you want to test on. user_message = "Hello, world." prompt = f"{system_message}\n\n### Instruction:\n\n{user_message}\n\n### Response:\n\n" outputs = model.generate(prompt, sampling_params) response = outputs[0].outputs[0].text.strip() print(response) ```