Justin Donaldson, Ph.D.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Choose the backend used for quantized kernels before any quantized module
# is created.
torch.backends.quantized.engine = "qnnpack"

# Checkpoint used for both the full-precision and the quantized variant.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the pre-trained weights and the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dynamically quantize every nn.Linear to int8 weights. `inplace` is left at
# its default, so `model` itself stays full precision for the comparison.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# Both pipelines share the task, the tokenizer and the device; device=-1 is
# the transformers convention for "run on CPU".
_pipeline_kwargs = {
    "task": "text-generation",
    "tokenizer": tokenizer,
    "device": -1,
}

# Text-generation pipeline backed by the quantized model.
quantized_pipeline = pipeline(model=quantized_model, **_pipeline_kwargs)

# Text-generation pipeline backed by the original model, used as baseline.
normal_pipeline = pipeline(model=model, **_pipeline_kwargs)

def get_model_size(model):
    """Return the storage used by ``model``'s tensors in MiB (bytes / 1024**2).

    Sums ``numel() * element_size()`` over every parameter and every buffer,
    then adds the weight and bias tensors held by quantized "packed params"
    modules. Dynamic quantization moves each converted layer's weight and
    bias out of ``parameters()`` into a packed object, so counting only
    parameters and buffers reports a quantized model as if its converted
    layers occupied no memory at all.

    Args:
        model: The ``torch.nn.Module`` to measure.

    Returns:
        float: Total tensor storage in MiB. Per-tensor quantization metadata
        (scales, zero points) is not included.
    """
    def tensor_bytes(tensor):
        return tensor.numel() * tensor.element_size()

    total_bytes = sum(tensor_bytes(param) for param in model.parameters())
    total_bytes += sum(tensor_bytes(buf) for buf in model.buffers())

    # Add tensors that live inside packed-params holders. Modules are handled
    # one at a time so only a single unpacked weight is alive at any moment.
    for module in model.modules():
        packed = getattr(module, "_packed_params", None)
        unpack = getattr(module, "_weight_bias", None)
        # A quantized Linear exposes the same two names as its holder, but
        # there `_packed_params` is the child holder module; skipping it
        # keeps every weight from being counted twice.
        if packed is None or isinstance(packed, torch.nn.Module) or not callable(unpack):
            continue
        # `unpack()` yields (weight, bias); bias may be None.
        total_bytes += sum(
            tensor_bytes(tensor) for tensor in unpack() if tensor is not None
        )

    return total_bytes / 1024**2

# Report the measured size (MiB) of the original model, then the quantized one.
for label, measured in (("model", model), ("quantized", quantized_model)):
    print(label, get_model_size(measured))

model 30633.023681640625
quantized 2005.023681640625

def test_pipeline(pipe, prompt="Tell a dad joke that involves socks with sandals"):
    """Run ``prompt`` through ``pipe`` and print whatever it returns.

    Args:
        pipe: Callable invoked as ``pipe(prompt, max_length=50)``; used here
            with both the quantized and the full-precision pipeline.
        prompt: Text handed to the pipeline.
    """
    # max_length is fixed at 50 so runs on different pipelines are comparable.
    print(pipe(prompt, max_length=50))
# Same default prompt through the quantized pipeline first, then the baseline.
for candidate in (quantized_pipeline, normal_pipeline):
    test_pipeline(candidate)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

[{'generated_text': "Tell a dad joke that involves socks with sandals\nHere's one:\n\nWhy did the sock go to the party with the sandal?\n\nBecause it was a sole-ful occasion! (get it? sole, like the bottom of the foot,"}]
[{'generated_text': "Tell a dad joke that involves socks with sandals\nHere's one: Why did the sock go with the sandal? Because it was a sole-ful match! (get it? sole-ful, like soulful, but also a reference"}]