Meta 8B model
The Meta Llama 3.1 8B model, the smallest member of the Llama 3.1 family, is designed for efficient and versatile natural language processing tasks.
If you have followed the setup instructions and your environment is ready, confirm the remaining prerequisites: the transformers, torch, and accelerate packages are installed, you have a Hugging Face API token, and your account has been granted access to the gated meta-llama repository. The following script loads the tokenizer and model and generates a response to a user-supplied question.
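Before running the script, you can optionally verify that PyTorch detects the GPUs. This sanity check is a minimal sketch and is not part of the original walkthrough:

import torch

print(torch.cuda.is_available())        # Should print True on a working CUDA setup
print(torch.cuda.get_device_name(0))    # Should report an NVIDIA H100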
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

token = "your_huggingface_api_token"  # Use your Hugging Face token
model_id = "meta-llama/Meta-Llama-3.1-8B"  # Repository IDs are case-sensitive
try:
    print("Loading tokenizer...")
    # token= replaces the deprecated use_auth_token= argument
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise  # Stop here; the rest of the script depends on the tokenizer
try:
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # bfloat16 halves memory use and runs natively on H100
        device_map="auto",           # Automatically spread weights across available GPUs
        token=token,
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    raise  # Stop here; inference cannot proceed without the model
try:
    # Accept user input
    question = input("Enter your question: ")

    # Tokenize input and move the tensors to the model's device
    print("Tokenizing input...")
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    print(f"Tokenized input: {inputs}")

    # Generate output
    print("Generating output...")
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_length=100,  # Maximum total length (prompt tokens + generated tokens)
    )
    print("Output generated successfully.")

    # Decode and print output
    print("Decoding output...")
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Decoded output: {decoded_output}")
except Exception as e:
    print(f"Error during processing: {e}")