Fine-Tuning LLMs with LoRA and QLoRA: Complete Guide 2026
Train custom AI models from Llama 3 and Mistral using LoRA/QLoRA fine-tuning on a single consumer GPU with less than 24GB VRAM
Fine-Tuning LLMs with LoRA and QLoRA: Complete Guide 2026
Full fine-tuning of a 7B LLM requires 8 A100 GPUs and $500+ per training run. LoRA (Low-Rank Adaptation) fine-tunes the same model on a single consumer GPU in 2-4 hours for under $5. This technique has democratized custom AI model development.
What Is LoRA?
Instead of updating all model weights, LoRA adds small trainable matrices to specific layers. Only these small matrices (~1% of parameters) are trained and stored. At inference, they're merged back into the original weights.
Result: 7B model fine-tuned with:
When to Fine-Tune vs Prompt Engineering
Setup
bash
pip install transformers datasets peft trl bitsandbytes accelerate wandb
Step 1: Prepare Your Dataset
python
from datasets import Dataset
import jsonTraining data format for instruction tuning
training_examples = [
{
"instruction": "Extract the company name, amount, and date from this press release.",
"input": "Acme Corp announced today the acquisition of StarTech for $45M, closing March 15, 2026.",
"output": '{"company": "Acme Corp", "acquisition_target": "StarTech", "amount": "$45M", "date": "March 15, 2026"}'
},
# Add 500-5000 examples for good results
]def format_instruction(sample):
"""Format as Alpaca-style prompt."""
if sample["input"]:
return f"""Below is an instruction that describes a task, paired with an input. Write a response.
Instruction:
{sample['instruction']}Input:
{sample['input']}Response:
{sample['output']}"""
else:
return f"""Below is an instruction. Write a response.Instruction:
{sample['instruction']}Response:
{sample['output']}"""dataset = Dataset.from_list(training_examples)
dataset = dataset.map(lambda x: {"text": format_instruction(x)})
dataset = dataset.train_test_split(test_size=0.1)
print(f"Train: {len(dataset['train'])} | Test: {len(dataset['test'])}")
Step 2: Configure QLoRA Training
python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainerMODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
4-bit quantization for memory efficiency (QLoRA)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4", # NormalFloat4 - best quality
bnb_4bit_compute_dtype=torch.bfloat16
)Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
quantization_config=bnb_config,
device_map="auto",
token="hf_your_token"
)
model.config.use_cache = False
model.config.pretraining_tp = 1tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token="hf_your_token")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
Prepare for k-bit training
model = prepare_model_for_kbit_training(model)
Step 3: LoRA Configuration
python
LoRA config - these settings work well for instruction tuning
peft_config = LoraConfig(
r=64, # Rank: higher = more parameters = better quality but more memory
lora_alpha=16, # Scaling factor (usually lora_alpha = r/4)
target_modules=[ # Which layers to fine-tune
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
Output: trainable params: 83,886,080 || all params: 8,114,933,760 || trainable%: 1.03
Step 4: Training
python
training_args = TrainingArguments(
output_dir="./llama3-finetuned",
num_train_epochs=3,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
gradient_accumulation_steps=4, # Effective batch size = 4 * 4 = 16
gradient_checkpointing=True,
optim="paged_adamw_32bit",
logging_steps=25,
save_strategy="epoch",
learning_rate=2e-4,
weight_decay=0.001,
fp16=False,
bf16=True,
max_grad_norm=0.3,
max_steps=-1,
warmup_ratio=0.03,
lr_scheduler_type="constant",
report_to="wandb",
evaluation_strategy="epoch"
)trainer = SFTTrainer(
model=model,
args=training_args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
dataset_text_field="text",
tokenizer=tokenizer,
peft_config=peft_config,
max_seq_length=2048
)
trainer.train()
trainer.save_model("./llama3-finetuned-final")
print("Training complete!")
Step 5: Merge and Export
python
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torchLoad base model in full precision for merge
base_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.bfloat16,
device_map="auto"
)Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "./llama3-finetuned-final")Merge adapter weights into base model
merged_model = model.merge_and_unload()Save the merged model
merged_model.save_pretrained("./llama3-finetuned-merged")
tokenizer.save_pretrained("./llama3-finetuned-merged")
print("Merged model saved!")
Step 6: Evaluation
python
def evaluate_model(model, tokenizer, test_cases: list) -> dict:
results = []
for case in test_cases:
prompt = format_instruction({"instruction": case["instruction"], "input": case.get("input", ""), "output": ""})
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=256,
temperature=0.1,
do_sample=True
)
generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = generated.split("### Response:")[-1].strip()
results.append({
"instruction": case["instruction"],
"expected": case["expected_output"],
"actual": response,
"match": response.strip() == case["expected_output"].strip()
})
accuracy = sum(r["match"] for r in results) / len(results)
print(f"Accuracy: {accuracy:.1%}")
return {"accuracy": accuracy, "results": results}
Hardware Requirements
Recommended GPU: NVIDIA RTX 4090 (24GB) for 7-8B models
Real-World Results
Companies using LoRA fine-tuning in production:
Conclusion
LoRA fine-tuning has made custom AI model development accessible. A 1000-example dataset and an RTX 4090 can produce a model that dramatically outperforms GPT-4 on specific domain tasks. The key investment is dataset quality—curate clean, diverse examples that represent your actual use cases.
Also available in 中文.