[OPTIONAL] PDF Extraction & Dataset Construction

In my case, the JSON file was created manually for testing fine-tuning with Unsloth. However, in a real-world scenario with a large collection of PDFs, this code becomes extremely useful for automating the process.

The code below:

Reads the resume PDF,
Heuristically finds sections (Skills, Experience, Education, Projects),
Creates multiple supervised examples (summaries, skills list, Q/A),
Writes the final dataset to a JSON file (a list of input/output pairs; easy to adapt to JSONL) that the trainer consumes.

# save as prepare_resume_dataset.py
import re
import json
from pathlib import Path
import pdfplumber
from typing import List, Dict

def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*.

    Pages are joined with a blank line. A page for which ``extract_text()``
    returns a falsy value (e.g. ``None``) contributes an empty string, so the
    page separators are still emitted for it.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n\n".join(page_texts)

def split_sections(text: str) -> Dict[str,str]:
    """Heuristically split resume text into sections keyed by heading word.

    Every whole-word, case-insensitive occurrence of a known heading starts a
    new slice that runs up to the next heading occurrence (or the end of the
    text). Each slice is stripped and still begins with the heading word
    itself.

    Args:
        text: Raw resume text.

    Returns:
        A dict mapping the lowercase heading name to its section text. When a
        heading occurs more than once, its slices are joined with a blank
        line in order of appearance. If no heading is found at all, the
        result is ``{"body": text}``.
    """
    # Basic heading vocabulary — adjust if the resume uses other words.
    # NOTE: "work experience" also contains "experience", so such a heading
    # yields both keys (the former holding only the word before "experience").
    headings = ["experience", "employment", "work experience",
                "education", "skills", "projects", "certifications", "summary", "profile"]

    # Match on the ORIGINAL text with IGNORECASE instead of on text.lower():
    # lower() can change the string length (e.g. "İ" becomes two code points),
    # which would shift every offset used to slice the original text.
    matches = []
    for h in headings:
        pattern = re.compile(rf"\b{re.escape(h)}\b", re.IGNORECASE)
        matches.extend((m.start(), h) for m in pattern.finditer(text))

    if not matches:
        # fallback: return whole text as 'body'
        return {"body": text}

    matches.sort()
    sections: Dict[str, str] = {}
    for i, (start, h) in enumerate(matches):
        end = matches[i + 1][0] if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        if h in sections:
            # A heading word seen again (e.g. "skills" mentioned in running
            # text) must not discard the section collected earlier.
            sections[h] = f"{sections[h]}\n\n{chunk}"
        else:
            sections[h] = chunk
    return sections

def _strip_heading(section: str, heading: str) -> str:
    """Remove a leading *heading* word (and an optional colon) from *section*."""
    pattern = rf"^\s*{re.escape(heading)}\b[ \t]*:?\s*"
    return re.sub(pattern, "", section, count=1, flags=re.IGNORECASE)


def create_examples(full_text: str, sections: Dict[str,str]) -> List[Dict]:
    """Build supervised ``{"input", "output"}`` examples from one resume.

    Examples are emitted in this order:
      1. a summarization prompt whose output is the first 2000 characters of
         the resume (raw text used as a placeholder, not a real summary);
      2. a LinkedIn-bio prompt whose output is the first three non-empty
         lines of the resume joined by spaces;
      3. a skills-as-JSON-array example, if a "skills" section has items;
      4. up to 6 experience entries (from "experience", else "employment");
      5. up to 4 project entries;
      6. up to five 1000-character chunks of the "body" section (or of the
         full text when there is no "body") with a compression prompt.

    For 4–6 the output is the raw source text itself.

    Args:
        full_text: Complete resume text.
        sections: Mapping of lowercase heading name to section text, as
            produced by ``split_sections``. Because those sections start
            with the heading word, a leading heading is stripped here so it
            does not show up as a skill or as an experience/project entry.

    Returns:
        The list of example dicts.
    """
    # Blank-line separator that also recognises Windows line endings; the
    # previous r"\n{2,}|\r\n{2,}" never matched "\r\n\r\n".
    blank_line = re.compile(r"(?:\r?\n){2,}")

    # Bio heuristic: name and headline usually sit in the first few lines.
    leading_lines = [ln.strip() for ln in full_text.strip().splitlines() if ln.strip()]
    bio = " ".join(leading_lines[:3])

    examples = [
        # 1) Full-resume summarization (text itself serves as the 'context')
        {
            "input": "Summarize this resume in 40-60 words.",
            "output": full_text[:2000],
        },
        # 2) Short bio
        {
            "input": "Write a professional 2-sentence bio for LinkedIn based on this resume.",
            "output": bio,
        },
    ]

    # 3) Skill extraction if available
    skills = _strip_heading(sections.get("skills", ""), "skills")
    skills_list = [s.strip() for s in re.split(r"[\n,;•]+", skills) if s.strip()]
    if skills_list:
        examples.append({
            "input": "List the technical skills found in this resume as a JSON array.",
            "output": json.dumps(skills_list, ensure_ascii=False)
        })

    # 4) One Q/A per experience entry; entries are separated by blank lines.
    exp_text = (_strip_heading(sections.get("experience", ""), "experience")
                or _strip_heading(sections.get("employment", ""), "employment"))
    entries = [e.strip() for e in blank_line.split(exp_text) if e.strip()]
    # Limit to 6 items so the dataset doesn't explode
    for e in entries[:6]:
        examples.append({
            "input": f"Extract role, company and dates from this experience line:\n\n{e}",
            "output": e  # output left raw
        })

    # 5) Projects (if any), capped at 4
    projects_text = _strip_heading(sections.get("projects", ""), "projects")
    projects = [p.strip() for p in blank_line.split(projects_text) if p.strip()]
    for p in projects[:4]:
        examples.append({
            "input": f"Summarize this project in 1 sentence:\n\n{p}",
            "output": p
        })

    # 6) Chunk the body (or the whole text) into 1000-character segments,
    #    covering at most the first 5000 characters.
    body = sections.get("body", "") or full_text
    for offset in range(0, min(len(body), 5000), 1000):
        c = body[offset:offset + 1000]
        examples.append({
            "input": "Compress the following into a 30-word summary:\n\n" + c,
            "output": c
        })

    return examples

if __name__ == "__main__":
    # Script entry point: resume PDF -> text -> sections -> examples -> JSON.
    pdf_path = "resume.pdf"  # replace with your file
    out_json = "resume_dataset.json"

    resume_text = extract_pdf_text(pdf_path)
    dataset = create_examples(resume_text, split_sections(resume_text))

    # The file holds a single list of {"input":..., "output":...} objects.
    payload = json.dumps(dataset, indent=2, ensure_ascii=False)
    Path(out_json).write_text(payload, encoding="utf-8")
    print(f"Saved {len(dataset)} examples to {out_json}")

Why these steps?

A single resume is tiny for fine-tuning. Creating multiple supervision tasks (summaries, extractions, skill lists, project summaries) gives the model different behaviors anchored to the same content, so the adapted weights are meaningful.
Using basic regex sectioning extracts structured signals (skills/experience) which are high-value for downstream tasks.