google
/

gemma-7b-AWQ

@@ -57,22 +57,6 @@ You can find fine-tuning notebooks under the [`examples/` directory](https://hug
 * A script to perform SFT using FSDP on TPU devices
 * A notebook that you can run on a free-tier Google Colab instance to perform SFT on the English quotes dataset. You can also find a copy of the notebook [here](https://github.com/huggingface/notebooks/blob/main/peft/gemma_7b_english_quotes.ipynb).
-#### Running the model on a CPU
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt")
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
-```
 #### Running the model on a single / multi GPU
@@ -81,8 +65,8 @@ print(tokenizer.decode(outputs[0]))
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto")
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
@@ -99,9 +83,10 @@ print(tokenizer.decode(outputs[0]))
 ```python
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto", torch_dtype=torch.float16)
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
@@ -116,46 +101,8 @@ print(tokenizer.decode(outputs[0]))
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto", torch_dtype=torch.bfloat16)
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
-```
-#### Quantized Versions through `bitsandbytes`
-* _Using 8-bit precision (int8)_
-```python
-# pip install bitsandbytes accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", quantization_config=quantization_config)
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
-```
-* _Using 4-bit precision_
-```python
-# pip install bitsandbytes accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", quantization_config=quantization_config)
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

 * A script to perform SFT using FSDP on TPU devices
 * A notebook that you can run on a free-tier Google Colab instance to perform SFT on the English quotes dataset. You can also find a copy of the notebook [here](https://github.com/huggingface/notebooks/blob/main/peft/gemma_7b_english_quotes.ipynb).
 #### Running the model on a single / multi GPU
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-AWQ")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-AWQ", device_map="auto")
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 ```python
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-AWQ")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-AWQ", device_map="auto", torch_dtype=torch.float16)
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 # pip install accelerate
 from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-AWQ")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-AWQ", device_map="auto", torch_dtype=torch.bfloat16)
 input_text = "Write me a poem about Machine Learning."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")