# finetune_gemma_unsloth.py
# Fine-tune gemma3:270m with Unsloth, using the resume dataset.
import os
import json
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
# === CONFIG ===
# Paths: base model location, training data file, and checkpoint directory.
LOCAL_MODEL_PATH = r"C:\Users\DEEPAK\Documents\Fine Tuning LLM Using Unsloth Hands-On\gemma-3-270m"  # <-- Update this
DATA_JSON = "resume_dataset.json"
OUTPUT_DIR = "unsloth_outputs_gemma_resume"

# Model loading and adapter settings.
MAX_SEQ_LENGTH = 2048
USE_4BIT = True  # Use 4bit quantization to save VRAM if supported
LORA_RANK = 8  # LoRA rank for adapter

# Training hyperparameters (consumed by TrainingArguments further down).
PER_DEVICE_BATCH, GRAD_ACCUM = 1, 4
NUM_EPOCHS = 4
LEARNING_RATE = 2e-4

# Shared seed for torch, LoRA initialisation and the trainer.
SEED = 3407
# Set seed for reproducibility
torch.manual_seed(SEED)

# Check GPU availability and, when a GPU is present, report the name of
# device 0 so the run log records what the job actually trained on.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
# Load dataset: DATA_JSON is expected to hold a JSON array of records, each
# with "input" and "output" entries (the keys format_prompt reads).
with open(DATA_JSON, "r", encoding="utf-8") as f:
    examples = json.load(f)

# Fail early with a clear message instead of a confusing error later on
# (e.g. iterating a dict's keys, or training on zero rows).
if not isinstance(examples, list) or not examples:
    raise ValueError(f"{DATA_JSON} must contain a non-empty JSON list of examples")
# Format prompts for SFT training — align with your inference prompt style
def format_prompt(example, eos_token="<|endoftext|>"):
    """Render one dataset record as a single SFT training string.

    Args:
        example: Mapping with ``"input"`` (the user prompt) and ``"output"``
            (the target completion) entries.
        eos_token: End-of-sequence marker appended after the completion.
            Defaults to ``"<|endoftext|>"`` to preserve the existing output;
            pass ``tokenizer.eos_token`` so the marker matches whatever the
            loaded tokenizer actually treats as EOS.

    Returns:
        The text ``"user: <input>\\nassistant: <output><eos_token>"``.

    Raises:
        KeyError: If ``example`` has no ``"input"`` or ``"output"`` entry.
    """
    # Chat-like layout: "user: <input>\nassistant: <output>" followed by EOS.
    return f"user: {example['input']}\nassistant: {example['output']}{eos_token}"
# Render every record to its training string, then wrap the strings in a
# single-column ("text") Dataset for the trainer.
formatted = list(map(format_prompt, examples))
dataset = Dataset.from_dict({"text": formatted})
# Load the base model and its tokenizer through Unsloth, honouring the
# sequence-length and 4-bit settings from the CONFIG section.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LOCAL_MODEL_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=USE_4BIT,
    dtype=None,  # no explicit dtype requested; the loader decides
)
# Wrap the base model with LoRA adapters; `model` is rebound to the
# adapter-wrapped model returned by get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_rslora=False,
    # Layers that receive an adapter, listed by module name.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)
# Determine precision flags.
# Exactly one half-precision mode is selected on GPU: bf16 when the device
# supports it, fp16 otherwise. fp16 is derived from bf16 rather than probed
# separately so the two flags can never both be True (TrainingArguments
# rejects that combination). On CPU both stay False.
bf16_flag = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16_flag = torch.cuda.is_available() and not bf16_flag
# Setup training arguments
training_args = TrainingArguments(
    # Checkpointing: save once per epoch, keep at most two checkpoints.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Batch shape and run length.
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=NUM_EPOCHS,
    # Optimizer and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=10,
    # Numeric precision, from the flags computed just before this call.
    fp16=fp16_flag,
    bf16=bf16_flag,
    # Logging, reproducibility and data loading.
    logging_steps=10,
    seed=SEED,
    dataloader_pin_memory=False,
)
# Build the supervised fine-tuning trainer over the "text" column of the
# formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=1,
    max_seq_length=MAX_SEQ_LENGTH,
)
# Start fine-tuning
trainer_stats = trainer.train()

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Example test prompt to verify the fine-tuned model
# NOTE(review): the training text uses the plain "user: ...\nassistant: ..."
# layout from format_prompt, while this prompt goes through the tokenizer's
# chat template — confirm the two are meant to differ.
device = "cuda" if torch.cuda.is_available() else "cpu"
test_messages = [
    {"role": "user", "content": "List the technical skills from the provided resume."}
]
# return_dict=True makes apply_chat_template return a mapping holding both
# input_ids and attention_mask; without it only the token ids come back and
# the key lookups below would fail.
inputs = tokenizer.apply_chat_template(
    test_messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(device)
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    do_sample=False,  # greedy decoding, so no temperature is passed
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(">>> SAMPLE OUTPUT:\n", response)
# Export the fine-tuned model as GGUF (q4_k_m quantization) for use with
# Ollama and other GGUF consumers.
gguf_dir = "gguf_model"
os.makedirs(gguf_dir, exist_ok=True)  # tolerate an existing output directory
model.save_pretrained_gguf(
    gguf_dir,
    tokenizer,
    quantization_method="q4_k_m",
)
print(f"Saved GGUF files to {gguf_dir}")
# - LOCAL_MODEL_PATH: Point this to your local gemma3:270m folder (or an HF id if available). Unsloth will try to load the model with its transformers-compatible loader.
# - USE_4BIT=True: Reduces VRAM needs via bitsandbytes. Good when fine-tuning on modest GPUs.
# - LORA_RANK=8: gemma3:270m is small; high ranks waste memory and can overfit a single-resume dataset.
# - dataset: Uses your JSON examples; format_prompt matches the SFT prompt pattern.
# - save_pretrained_gguf: Merges LoRA + base and writes GGUF, which is required to import the model into Ollama.
#
# - LOCAL_MODEL_PATH → where your Gemma3:270M model is.
# - DATA_JSON → your dataset file (resume_dataset.json).
# The rest you can leave as is unless you want to tune the hyperparameters.