1import pandas as pd
2import subprocess
3import os
4
5
# Load the previously classified jobs. Header names are trimmed and
# lower-cased so later lookups ('classified_job', 'company', ...) do not
# depend on how the CSV author capitalised or padded them.
df = pd.read_csv('classified_jobs.csv').rename(
    columns=lambda header: header.strip().lower()
)
8
9
# Closed set of labels the LLM is allowed to answer with. The list is
# interpolated into the prompt and also used to validate the model's reply;
# "Others" doubles as the fallback label.
categories = [
    "Banking",
    "Insurance",
    "General Financial Services",
    "Healthcare",
    "Education",
    "Defence",
    "Government - Federal",
    "Government - State",
    "Retail",
    "Food and Beverages",
    "Fast Moving Consumer Goods (FMCG)",
    "Travel and Tourism",
    "Airline and Aviation",
    "Consulting",
    "Technology",
    "Telco",
    "Government - local",
    "Mining, Resources & Energy",
    "Others",
]
17
18
def classify_with_llm(company, location, job_title, description):
    """Classify a job posting into one of ``categories`` using a local LLM.

    Builds a prompt from the posting's fields, pipes it to
    ``ollama run gemma3:1b`` on stdin and interprets the first line of the
    model's reply as the category.

    Args:
        company: Company name of the posting.
        location: Location of the posting.
        job_title: Job title of the posting.
        description: Free-text job description.

        Any of the four may be ``None`` or a float NaN (what pandas yields
        for an empty CSV cell); such values are sent as an empty string
        instead of the literal text "nan"/"None".

    Returns:
        The matching entry of ``categories`` in its canonical spelling, or
        ``"Others"`` when the model cannot be started, times out, or replies
        with something that is not in the list.
    """

    def _as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    company = _as_text(company)
    location = _as_text(location)
    job_title = _as_text(job_title)
    description = _as_text(description)

    prompt = f"""
You're a classification assistant. Based on the job title and description, classify the job into one of the following categories only:
{", ".join(categories)}.

Respond with only one category. Do not invent or guess new categories.

Job Title: {job_title}
Job Description: {description}
Company Name: {company}
Location: {location}
"""
    # Only the subprocess call sits inside the try: a missing `ollama` binary
    # surfaces as OSError, a timeout as subprocess.TimeoutExpired.
    try:
        result = subprocess.run(
            ["ollama", "run", "gemma3:1b"],
            input=prompt,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",  # a stray undecodable byte must not discard the reply
            timeout=80,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"⚠️ LLM failed for {job_title[:30]}: {e}")
        return "Others"

    if result.returncode != 0:
        # Surface the failure instead of silently mapping it to "Others";
        # stdout is still inspected below in case a reply was produced.
        print(
            f"⚠️ LLM failed for {job_title[:30]}: "
            f"exit code {result.returncode}: {result.stderr.strip()}"
        )

    reply = result.stdout.strip()
    first_line = reply.splitlines()[0] if reply else ""
    # Small models often decorate the label (markdown emphasis, quotes, a
    # trailing full stop) or change its case; tolerate that, but still only
    # ever return one of the known categories.
    answer = first_line.strip().strip("*`\"'.").strip()
    canonical_by_folded = {name.casefold(): name for name in categories}
    return canonical_by_folded.get(answer.casefold(), "Others")
44
45
# Resume support: rows already present in the output file are treated as
# done, and processing restarts at the first row that is not yet saved.
output_file = 'classified_jobs_updated_llm.csv'
final_df = pd.DataFrame()
if os.path.exists(output_file):
    try:
        final_df = pd.read_csv(output_file)
    except pd.errors.EmptyDataError:
        # A zero-byte checkpoint (e.g. left behind by an interrupted write)
        # holds no rows; start from scratch instead of crashing on every rerun.
        final_df = pd.DataFrame()
start_idx = len(final_df)
53
54
# Walk the not-yet-processed rows. Rows whose current label is "Others" are
# sent to the LLM; every other row keeps its existing label. Results are
# flushed to disk every 50 rows (and at the very end) so an interrupted run
# can be resumed from the output file.
batch = []
total_rows = len(df)
for idx in range(start_idx, total_rows):
    row = df.iloc[idx]

    # An empty CSV cell is read as float NaN, which has no .strip(); the
    # default passed to .get() only covers a missing column, not a missing value.
    raw_class = row.get('classified_job', '')
    current_class = '' if pd.isna(raw_class) else str(raw_class).strip()

    if current_class.lower() == "others":
        updated_class = classify_with_llm(
            row.get('company', ''),
            row.get('location', ''),
            row.get('job_title', ''),
            row.get('description', '')
        )
    else:
        updated_class = current_class

    row_data = row.to_dict()
    row_data['updated_category'] = updated_class
    batch.append(row_data)

    if (idx + 1) % 50 == 0 or (idx + 1) == total_rows:
        temp_df = pd.DataFrame(batch)
        final_df = pd.concat([final_df, temp_df], ignore_index=True)
        final_df.to_csv(output_file, index=False)
        batch = []
        print(f"✅ Saved {idx + 1} rows so far")

print("🎉 Done. All 'Others' reclassified and saved.")