THUDM
/

chatglm3-6b

@@ -125,8 +125,9 @@ class QuantizedLinear(torch.nn.Module):
     def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
                  **kwargs):
         super().__init__()
         self.weight_bit_width = weight_bit_width
         shape = weight.shape
         if weight is None or empty_init:
@@ -154,7 +155,7 @@ def quantize(model, weight_bit_width, empty_init=False, device=None):
     for layer in model.layers:
         layer.self_attention.query_key_value = QuantizedLinear(
             weight_bit_width=weight_bit_width,
-            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
             bias=layer.self_attention.query_key_value.bias,
             dtype=layer.self_attention.query_key_value.weight.dtype,
             device=layer.self_attention.query_key_value.weight.device if device is None else device,
@@ -162,7 +163,7 @@ def quantize(model, weight_bit_width, empty_init=False, device=None):
         )
         layer.self_attention.dense = QuantizedLinear(
             weight_bit_width=weight_bit_width,
-            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
             bias=layer.self_attention.dense.bias,
             dtype=layer.self_attention.dense.weight.dtype,
             device=layer.self_attention.dense.weight.device if device is None else device,
@@ -170,7 +171,7 @@ def quantize(model, weight_bit_width, empty_init=False, device=None):
         )
         layer.mlp.dense_h_to_4h = QuantizedLinear(
             weight_bit_width=weight_bit_width,
-            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
             bias=layer.mlp.dense_h_to_4h.bias,
             dtype=layer.mlp.dense_h_to_4h.weight.dtype,
             device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
@@ -178,7 +179,7 @@ def quantize(model, weight_bit_width, empty_init=False, device=None):
         )
         layer.mlp.dense_4h_to_h = QuantizedLinear(
             weight_bit_width=weight_bit_width,
-            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
             bias=layer.mlp.dense_4h_to_h.bias,
             dtype=layer.mlp.dense_4h_to_h.weight.dtype,
             device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,

     def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
                  **kwargs):
         super().__init__()
+        assert str(weight.device).startswith('cuda'), 'The weights that need to be quantified should be on the CUDA device'
         self.weight_bit_width = weight_bit_width
         shape = weight.shape
         if weight is None or empty_init:
     for layer in model.layers:
         layer.self_attention.query_key_value = QuantizedLinear(
             weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight,
             bias=layer.self_attention.query_key_value.bias,
             dtype=layer.self_attention.query_key_value.weight.dtype,
             device=layer.self_attention.query_key_value.weight.device if device is None else device,
         )
         layer.self_attention.dense = QuantizedLinear(
             weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight,
             bias=layer.self_attention.dense.bias,
             dtype=layer.self_attention.dense.weight.dtype,
             device=layer.self_attention.dense.weight.device if device is None else device,
         )
         layer.mlp.dense_h_to_4h = QuantizedLinear(
             weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight,
             bias=layer.mlp.dense_h_to_4h.bias,
             dtype=layer.mlp.dense_h_to_4h.weight.dtype,
             device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
         )
         layer.mlp.dense_4h_to_h = QuantizedLinear(
             weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight,
             bias=layer.mlp.dense_4h_to_h.bias,
             dtype=layer.mlp.dense_4h_to_h.weight.dtype,
             device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,