In my case, the JSON file was created manually for testing fine-tuning with Unsloth. However, in a real-world scenario with a large collection of PDFs, this code becomes extremely useful for automating the process.
The code below:
- Reads the resume PDF,
- Heuristically finds sections (Skills, Experience, Education, Projects),
- Creates multiple supervised examples (summaries, skills list, Q/A),
- Writes the final dataset to a JSON file (a list of {"input": ..., "output": ...} records) that the trainer consumes.
# save as prepare_resume_dataset.py
import re
import json
from pathlib import Path
import pdfplumber
from typing import List, Dict
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The per-page text in page order, separated by "\\n\\n".  A page for
        which ``extract_text()`` yields nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n\n".join(page_texts)
def split_sections(text: str) -> Dict[str, str]:
    """Split resume text into sections keyed by a lower-cased heading keyword.

    A heading is a line made of at most two qualifier words followed by one of
    the known keywords, optionally ending in a colon ("Skills",
    "Work Experience", "Technical Skills: Python, SQL").  Anchoring headings to
    their own line keeps ordinary sentences that merely mention "experience"
    or "skills" from being cut into bogus sections.

    Args:
        text: Raw resume text.

    Returns:
        A mapping from keyword ("experience", "skills", ...) to the text that
        follows that heading, without the heading itself.  Qualified headings
        are filed under their keyword, so "Work Experience" is returned as
        "experience".  A heading that occurs more than once has its pieces
        joined by a blank line.  Text before the first heading is not
        included.  If no heading is found, ``{"body": text}`` is returned.
    """
    # Basic heading keywords — adjust if the resume uses other words.
    keywords = ("experience", "employment", "education", "skills",
                "projects", "certifications", "summary", "profile")
    # Matching is done on the original text with IGNORECASE (rather than on
    # text.lower()) so that match offsets always index the original string.
    heading_re = re.compile(
        r"^[ \t]*"                      # optional indentation
        r"(?:[A-Za-z&]+[ \t]+){0,2}"    # up to two qualifier words
        r"(" + "|".join(re.escape(k) for k in keywords) + r")"
        r"[ \t]*(?::|\r?$)",            # "Skills:" or the end of the line
        re.IGNORECASE | re.MULTILINE,
    )

    hits = list(heading_re.finditer(text))
    if not hits:
        # fallback: return whole as 'body'
        return {"body": text}

    # Each section runs from the end of its heading to the start of the next.
    stops = [hit.start() for hit in hits[1:]] + [len(text)]
    sections: Dict[str, str] = {}
    for hit, stop in zip(hits, stops):
        key = hit.group(1).casefold()
        content = text[hit.end():stop].strip()
        earlier = sections.get(key, "")
        # A repeated heading extends its section instead of overwriting it.
        if earlier and content:
            sections[key] = f"{earlier}\n\n{content}"
        else:
            sections[key] = earlier or content
    return sections
def _drop_heading_label(section: str, *labels: str) -> str:
    """Strip a leading heading label such as "Skills" or "Skills:" from a section."""
    label_re = r"^\s*(?:" + "|".join(re.escape(label) for label in labels) + r")\b[ \t]*:?"
    return re.sub(label_re, "", section, count=1, flags=re.IGNORECASE).strip()


def _paragraphs(section: str) -> List[str]:
    """Split a section on blank lines (LF or CRLF) and drop empty pieces."""
    return [p.strip() for p in re.split(r"(?:\r?\n[ \t]*){2,}", section) if p.strip()]


def create_examples(
    full_text: str,
    sections: Dict[str, str],
    *,
    max_context_chars: int = 2000,
    max_experience_entries: int = 6,
    max_projects: int = 4,
    chunk_size: int = 1000,
    max_chunked_chars: int = 5000,
) -> List[Dict]:
    """Build supervised ``{"input": ..., "output": ...}`` examples from one resume.

    Examples are emitted in this order: full-resume summary, LinkedIn bio,
    skills list (if a "skills" section exists), one example per experience
    entry, one per project, then one per fixed-size chunk of the body.

    Args:
        full_text: Complete resume text.
        sections: Mapping of section name to section text.  The keys read are
            "skills", "experience", "employment", "projects" and "body".  A
            section may still start with its own heading label ("Skills",
            "Experience:"); the label is removed before use.
        max_context_chars: Maximum characters of ``full_text`` used as the
            target of the summary example.
        max_experience_entries: Cap on experience examples.
        max_projects: Cap on project examples.
        chunk_size: Size in characters of each body chunk.
        max_chunked_chars: Only this many leading characters of the body are
            chunked.

    Returns:
        The list of example dicts.  Empty when ``full_text`` contains no
        non-whitespace text, so no example with an empty target is produced.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not full_text.strip():
        return []

    # Bio heuristic: the first three non-empty lines usually hold the name
    # and headline.
    non_empty_lines = [line.strip() for line in full_text.splitlines() if line.strip()]
    bio = " ".join(non_empty_lines[:3])

    # NOTE(review): for the summary, experience, project and chunk tasks the
    # target is the source passage itself, not a real summary/extraction.
    # Replace these targets with curated labels before serious training.
    examples: List[Dict] = [
        # 1) Full-resume summarization (the text is used as 'context')
        {
            "input": "Summarize this resume in 40-60 words.",
            "output": full_text[:max_context_chars],
        },
        # 2) Short bio
        {
            "input": "Write a professional 2-sentence bio for LinkedIn based on this resume.",
            "output": bio,
        },
    ]

    # 3) Skill extraction if available
    skills = _drop_heading_label(sections.get("skills", ""), "skills")
    skills_list = [s.strip() for s in re.split(r"[\n,;•]+", skills) if s.strip()]
    if skills_list:
        examples.append({
            "input": "List the technical skills found in this resume as a JSON array.",
            "output": json.dumps(skills_list, ensure_ascii=False),
        })

    # 4) One Q/A per experience entry (entries are separated by blank lines);
    #    capped so the dataset doesn't explode.
    exp_text = sections.get("experience", "") or sections.get("employment", "")
    exp_text = _drop_heading_label(exp_text, "experience", "employment")
    for entry in _paragraphs(exp_text)[:max_experience_entries]:
        examples.append({
            "input": f"Extract role, company and dates from this experience line:\n\n{entry}",
            "output": entry,
        })

    # 5) Projects (if any)
    projects_text = _drop_heading_label(sections.get("projects", ""), "projects")
    for project in _paragraphs(projects_text)[:max_projects]:
        examples.append({
            "input": f"Summarize this project in 1 sentence:\n\n{project}",
            "output": project,
        })

    # 6) Fallback: chunk the body into smaller segments and ask to compress
    body = sections.get("body", "") or full_text
    for start in range(0, min(len(body), max_chunked_chars), chunk_size):
        chunk = body[start:start + chunk_size]
        if not chunk.strip():
            continue  # a whitespace-only chunk carries no training signal
        examples.append({
            "input": "Compress the following into a 30-word summary:\n\n" + chunk,
            "output": chunk,
        })

    return examples
def main(argv=None):
    """Build the dataset for one resume PDF and write it to a JSON file.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.  Takes
            an optional PDF path and an optional output path.  Without them
            the script reads ``resume.pdf`` and writes ``resume_dataset.json``.
    """
    import argparse  # local import: only the command-line entry point needs it

    parser = argparse.ArgumentParser(
        description="Turn a resume PDF into a small fine-tuning dataset."
    )
    parser.add_argument("pdf_path", nargs="?", default="resume.pdf",
                        help="resume PDF to read (default: resume.pdf)")
    parser.add_argument("out_json", nargs="?", default="resume_dataset.json",
                        help="JSON file to write (default: resume_dataset.json)")
    args = parser.parse_args(argv)

    text = extract_pdf_text(args.pdf_path)
    sections = split_sections(text)
    examples = create_examples(text, sections)
    # Save as a list of {"input":..., "output":...}
    with open(args.out_json, "w", encoding="utf-8") as f:
        json.dump(examples, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(examples)} examples to {args.out_json}")


if __name__ == "__main__":
    main()
Why these steps?
- A single resume is tiny for fine-tuning. Creating multiple supervision tasks (summaries, extractions, skill lists, project summaries) gives the model different behaviors anchored to the same content, so the adapted weights are meaningful.
- Using basic regex sectioning extracts structured signals (skills/experience) which are high-value for downstream tasks.