In my case, the JSON file was created manually for testing fine-tuning with Unsloth. However, in a real-world scenario with a large collection of PDFs, this code becomes extremely useful for automating the process.

The code below:

# save as prepare_resume_dataset.py
import re
import json
from pathlib import Path
import pdfplumber
from typing import List, Dict

def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The per-page text joined with ``"\\n\\n"``. A page whose
        ``extract_text()`` result is falsy contributes an empty string, so
        the number of separators always reflects the number of pages.
    """
    with pdfplumber.open(pdf_path) as document:
        per_page = [page.extract_text() or "" for page in document.pages]
    return "\n\n".join(per_page)

def split_sections(text: str) -> Dict[str,str]:
    """Split resume text into sections keyed by the heading word that opens them.

    Every case-insensitive, whole-word occurrence of a known heading is treated
    as a section boundary. A section runs from its heading (inclusive) up to
    the next boundary or the end of the text. Text that precedes the first
    heading is not returned.

    Args:
        text: Raw resume text.

    Returns:
        A dict mapping the lowercase heading to the stripped text of that
        section. When a heading word occurs more than once, its pieces are
        joined with a newline in document order instead of the later one
        replacing the earlier one. If no heading is found at all, the result
        is ``{"body": text}``.
    """
    # Basic heading vocabulary — adjust if resume uses other words
    headings = ["experience", "employment", "work experience",
                "education", "skills", "projects", "certifications", "summary", "profile"]

    # Search the original text case-insensitively instead of a lowercased copy:
    # str.lower() can change the length of the string (e.g. U+0130 lowers to
    # two code points), which would shift every offset used for slicing below.
    # NOTE: "experience" also matches inside "work experience", so that heading
    # yields two boundaries and the "work experience" entry holds only the text
    # up to the word "experience".
    matches = []
    for h in headings:
        pattern = re.compile(rf"\b{re.escape(h)}\b", re.IGNORECASE)
        matches.extend((m.start(), h) for m in pattern.finditer(text))
    if not matches:
        # fallback: return whole as 'body'
        return {"body": text}
    matches.sort()

    sections: Dict[str, str] = {}
    for i, (start, h) in enumerate(matches):
        end = matches[i + 1][0] if i + 1 < len(matches) else len(text)
        piece = text[start:end].strip()
        if h in sections:
            # The heading word recurred (e.g. mentioned again in running
            # text); keep both pieces rather than dropping the earlier one.
            sections[h] = f"{sections[h]}\n{piece}"
        else:
            sections[h] = piece
    return sections

def create_examples(full_text: str, sections: Dict[str,str]) -> List[Dict]:
    """Build instruction-style training examples from one resume.

    Examples are emitted in this order: full-resume summary, LinkedIn bio,
    skills list (if a "skills" section yields any item), up to 6 experience
    entries, up to 4 project entries, and finally up to five 1000-character
    chunks of the body.

    Args:
        full_text: The complete resume text.
        sections: Mapping of section name to section text. The keys read here
            are "skills", "experience", "employment", "projects" and "body".

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts. Apart from the
        skills list and the bio, outputs are the raw source text of the
        corresponding span.
    """
    # Blank-line separator between entries; tolerates both LF and CRLF text.
    blank_lines = re.compile(r"(?:\r?\n){2,}")

    # Bio heuristic: join the first three non-empty lines of the resume.
    non_empty_lines = [ln.strip() for ln in full_text.strip().splitlines() if ln.strip()]
    bio = " ".join(non_empty_lines[:3])

    examples = [
        # 1) Full-resume summarization: the (truncated) text is used as context
        {
            "input": "Summarize this resume in 40-60 words.",
            "output": full_text[:2000],
        },
        # 2) Short bio built by the heuristic above
        {
            "input": "Write a professional 2-sentence bio for LinkedIn based on this resume.",
            "output": bio,
        },
    ]

    # 3) Skill extraction if available
    skills = sections.get("skills", "")
    if skills:
        # The section text may begin with its own heading word; drop it so
        # "Skills" is not reported as a skill.
        skills_body = re.sub(r"^\s*skills\b\s*[:\-–—]?\s*", "", skills,
                             count=1, flags=re.IGNORECASE)
        skills_list = [s.strip() for s in re.split(r"[\n,;•]+", skills_body) if s.strip()]
        if skills_list:
            examples.append({
                "input": "List the technical skills found in this resume as a JSON array.",
                "output": json.dumps(skills_list, ensure_ascii=False)
            })

    # 4) One example per experience entry (entries are separated by blank lines)
    exp_text = sections.get("experience", "") or sections.get("employment", "")
    if exp_text:
        entries = [e.strip() for e in blank_lines.split(exp_text) if e.strip()]
        # Limit to N items so dataset doesn't explode
        for entry in entries[:6]:
            examples.append({
                "input": f"Extract role, company and dates from this experience line:\n\n{entry}",
                "output": entry  # output is left raw
            })

    # 5) Projects (if any)
    if sections.get("projects"):
        projects = [p.strip() for p in blank_lines.split(sections["projects"]) if p.strip()]
        for project in projects[:4]:
            examples.append({
                "input": f"Summarize this project in 1 sentence:\n\n{project}",
                "output": project
            })

    # 6) Chunk the body (or the whole text when there is no "body" section)
    #    into ~1000 character segments, looking at the first 5000 characters only
    body = sections.get("body", "") or full_text
    for offset in range(0, min(len(body), 5000), 1000):
        chunk = body[offset:offset + 1000]
        examples.append({
            "input": "Compress the following into a 30-word summary:\n\n" + chunk,
            "output": chunk
        })

    return examples

if __name__ == "__main__":
    # Imported here because argparse is only needed when run as a script.
    import argparse

    # Both paths are optional positional arguments; with no arguments the
    # script reads "resume.pdf" and writes "resume_dataset.json" as before.
    parser = argparse.ArgumentParser(
        description="Convert a resume PDF into a JSON dataset of input/output examples."
    )
    parser.add_argument("pdf_path", nargs="?", default="resume.pdf",
                        help="PDF file to read (default: %(default)s)")
    parser.add_argument("out_json", nargs="?", default="resume_dataset.json",
                        help="JSON file to write (default: %(default)s)")
    args = parser.parse_args()
    pdf_path = args.pdf_path
    out_json = args.out_json

    text = extract_pdf_text(pdf_path)
    sections = split_sections(text)
    examples = create_examples(text, sections)
    # Save as a list of {"input":..., "output":...}
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(examples, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(examples)} examples to {out_json}")

Why these steps?