Hindi-Gemma-2B-instruct is an instruction-tuned Hindi large language model (LLM).
```python
# To do inference using the LoRA adapters

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Ellight/gemma-2b-bnb-4bit", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = """
...
"""

inputs = tokenizer(
[
    prompt
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)
```
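The loader arguments `max_seq_length`, `dtype`, and `load_in_4bit` are referenced above but not defined in this snippet; they should match whatever was used when the adapters were trained. A minimal sketch with the usual Unsloth notebook defaults (assumed values, not taken from this card):

```python
max_seq_length = 2048  # assumed value; use the sequence length the adapters were trained with
dtype = None           # assumed; None lets Unsloth auto-detect float16 vs bfloat16
load_in_4bit = True    # assumed; loads the base weights 4-bit quantized to save memory
```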
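Note that `tokenizer.batch_decode(outputs)` returns the prompt together with the completion. To keep only the newly generated text, or to stream tokens as they are produced, the standard `transformers` utilities work with the objects created above (a sketch under that assumption; `TextStreamer` is not part of the original card):

```python
from transformers import TextStreamer

# Keep only the tokens generated after the prompt
generated = outputs[:, inputs["input_ids"].shape[1]:]
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

# Or stream the completion to stdout as it is generated
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=64, use_cache=True)
```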