Fine-Tuning LLM With Unsloth

Below is the script, adapted to `gemma3:270m`, using the resume dataset.

# finetune_gemma_unsloth.py
import os
import json
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# === CONFIG ===
# -- Paths -------------------------------------------------------------------
# Folder holding the base model weights; passed to the loader as model_name.
LOCAL_MODEL_PATH = r"C:\Users\DEEPAK\Documents\Fine Tuning LLM Using Unsloth Hands-On\gemma-3-270m"  # <-- Update this
# JSON file containing the training examples.
DATA_JSON = "resume_dataset.json"
# Directory handed to the trainer as output_dir (checkpoints are written here).
OUTPUT_DIR = "unsloth_outputs_gemma_resume"

# -- Model loading / adapters -------------------------------------------------
# Sequence length limit given to both the model loader and the trainer.
MAX_SEQ_LENGTH = 2048
# Load the base weights 4-bit quantized to save VRAM, if supported.
USE_4BIT = True
# Rank of the LoRA adapter matrices.
LORA_RANK = 8

# -- Optimisation -------------------------------------------------------------
PER_DEVICE_BATCH = 1    # per_device_train_batch_size
GRAD_ACCUM = 4          # gradient_accumulation_steps
NUM_EPOCHS = 4          # num_train_epochs
LEARNING_RATE = 0.0002  # same value as 2e-4
# Shared by torch.manual_seed, the LoRA initialisation and the trainer.
SEED = 3407

# Seed PyTorch's global RNG so repeated runs start from the same state.
torch.manual_seed(SEED)

# Report whether CUDA is usable and, if it is, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

# Read the training examples from the JSON file named by DATA_JSON.
with open(DATA_JSON, mode="r", encoding="utf-8") as dataset_file:
    examples = json.loads(dataset_file.read())

# Load model + tokenizer with unsloth.
# This runs before the prompts are formatted because the end-of-sequence
# marker appended to every training example has to come from the tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LOCAL_MODEL_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # None: leave the dtype choice to Unsloth's auto-detection
    load_in_4bit=USE_4BIT,
)


# Format prompts for SFT training
def format_prompt(example, eos_token=None):
    """Render one dataset record as a single SFT training string.

    The layout is a two-line chat transcript: a ``user:`` line holding the
    request, then an ``assistant:`` line holding the answer, immediately
    followed by the end-of-sequence marker.

    Args:
        example: Mapping with an ``"input"`` key (the user request) and an
            ``"output"`` key (the expected answer).
        eos_token: Marker appended after the answer. When omitted, the
            loaded tokenizer's own ``eos_token`` is used. A hard-coded
            ``<|endoftext|>`` only acts as EOS for tokenizers that define
            that exact token; for any other tokenizer it is ordinary text
            and the model never learns where to stop.

    Returns:
        The formatted training string.

    Raises:
        KeyError: If ``example`` lacks the ``"input"`` or ``"output"`` key.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token
    return f"user: {example['input']}\nassistant: {example['output']}{eos_token}"


# One formatted string per example, stored in the "text" column that the
# trainer is told to read.
formatted = [format_prompt(e) for e in examples]
dataset = Dataset.from_dict({"text": formatted})

# Wrap the base model with LoRA adapters on the listed projection modules.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    # Adapter shape and placement
    target_modules=lora_target_modules,
    r=LORA_RANK,
    lora_alpha=32,
    lora_dropout=0.05,
    use_rslora=False,
    bias="none",
    # Memory / reproducibility
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)

# Determine precision flags.
# On a CUDA device exactly one mixed-precision mode is enabled: bf16 when the
# GPU reports support for it, fp16 otherwise. Without CUDA both stay False.
# fp16 is derived from the bf16 probe because torch.cuda documents no
# is_fp16_supported(); a hasattr() guard on that name left fp16 permanently
# off, so GPUs without bf16 silently trained in full precision.
bf16_flag = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16_flag = torch.cuda.is_available() and not bf16_flag

# Collect every trainer hyperparameter in one TrainingArguments object.
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Batch size and schedule length
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=NUM_EPOCHS,
    # Optimiser and learning-rate schedule
    optim="adamw_8bit",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=10,
    # Numeric precision (flags computed just above)
    fp16=fp16_flag,
    bf16=bf16_flag,
    # Miscellaneous
    logging_steps=10,
    seed=SEED,
    dataloader_pin_memory=False,
)

# Build the supervised fine-tuning trainer from the pieces prepared above.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column holding the formatted prompts
    dataset_num_proc=1,
    max_seq_length=MAX_SEQ_LENGTH,
)

# Run fine-tuning; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

# Switch the model into Unsloth's inference mode before generating.
# (This call does not merge the LoRA adapters into the base weights.)
FastLanguageModel.for_inference(model)

# Example test prompt to verify the fine-tuned model
device = "cuda" if torch.cuda.is_available() else "cpu"

test_messages = [
    {"role": "user", "content": "List the technical skills from the provided resume."}
]

# NOTE(review): the training strings use a plain "user: ... / assistant: ..."
# layout, while this prompt is rendered with the tokenizer's chat template.
# Confirm which format is intended at inference time.
#
# return_dict=True makes `inputs` a mapping with "input_ids" and
# "attention_mask"; without it only the token ids are returned and the
# key lookups below fail.
inputs = tokenizer.apply_chat_template(
    test_messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    # Greedy decoding. temperature is a sampling-only setting, so it is not
    # passed when do_sample is False.
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# The decoded sequence contains the prompt followed by the generated tokens.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(">>> SAMPLE OUTPUT:\n", response)

# Export the fine-tuned model as GGUF (the format Ollama and similar tools
# can import), using the q4_k_m quantization method.
gguf_dir = "gguf_model"
gguf_quantization = "q4_k_m"

os.makedirs(gguf_dir, exist_ok=True)  # no error if the folder already exists
model.save_pretrained_gguf(
    gguf_dir,
    tokenizer,
    quantization_method=gguf_quantization,
)
print("Saved GGUF files to {}".format(gguf_dir))

Key Modifications & Reasoning

LOCAL_MODEL_PATH: Point this to your local gemma3:270m folder (or to a Hugging Face model ID, if available). Unsloth will try to load the model with its transformers-compatible loader.
USE_4BIT=True: Reduces VRAM requirements via bitsandbytes quantization. Useful when fine-tuning on modest GPUs.
LORA_RANK=8: gemma3:270m is small; higher ranks waste memory and can overfit a single-resume dataset.
dataset: Loads your JSON examples (a list of objects with "input" and "output" fields); format_prompt turns each one into a single "user: … / assistant: …" training string for SFT.
save_pretrained_gguf: Merges the LoRA adapters into the base model and writes a GGUF file, the format required to import the model into Ollama.

My Dataset

deepak_resume_dataset.json

You must set two things manually

LOCAL_MODEL_PATH → the folder where your Gemma3:270M model is stored.
DATA_JSON → the path to your dataset file (resume_dataset.json by default).

You can leave the rest as is unless you want to tune the hyperparameters.

Below is the script, adapted to gemma3:270m, using the resume dataset.

Key Modifications & Reasoning

My Dataset

You must set two things manually

Below is the script, adapted to `gemma3:270m`, using the resume dataset.