AusBiz Consulting

Python

1import pandas as pd
2import subprocess
3import os
4
# Load the already-classified job listings. Header names are trimmed and
# lower-cased so later column lookups do not depend on the CSV's formatting.
df = pd.read_csv('classified_jobs.csv')
df.columns = df.columns.map(lambda header: header.strip().lower())

# Closed set of category labels used to validate LLM output.
categories = [
    "Banking",
    "Insurance",
    "General Financial Services",
    "Healthcare",
    "Education",
    "Defence",
    "Government - Federal",
    "Government - State",
    "Retail",
    "Food and Beverages",
    "Fast Moving Consumer Goods (FMCG)",
    "Travel and Tourism",
    "Airline and Aviation",
    "Consulting",
    "Technology",
    "Telco",
    "Government - local",
    "Mining, Resources & Energy",
    "Others",
]
17
def _as_text(value):
    """Return *value* as text, mapping missing values (None/NaN/NA) to ''."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


def _match_category(raw_output, allowed):
    """Map raw model output onto one of *allowed*; return None if no match.

    Only the first non-blank line is considered. Surrounding whitespace,
    quotes, markdown emphasis and a trailing period are ignored, and the
    comparison is case-insensitive, so "banking." still resolves to "Banking".
    """
    lines = raw_output.strip().splitlines()  # splitlines() also handles \r\n
    if not lines:
        return None
    candidate = lines[0].strip().strip("*`\"'.").strip().casefold()
    by_folded_name = {name.casefold(): name for name in allowed}
    return by_folded_name.get(candidate)


# LLM-based classification for only "Others"
def classify_with_llm(company, location, job_title, description):
    """Ask a local Ollama model to pick a category for one job listing.

    Args:
        company: Company name; missing values (None/NaN) are sent as "".
        location: Job location; missing values are sent as "".
        job_title: Job title; missing values are sent as "".
        description: Job description; missing values are sent as "".

    Returns:
        One of the strings in the module-level ``categories`` list. Falls
        back to "Others" when the model cannot be run, exits with an error,
        times out (80 seconds), or answers with something that is not a
        known category.
    """
    # Values coming from a DataFrame row may be float NaN; normalise them so
    # the prompt does not contain "nan" and the log line below can slice.
    company = _as_text(company)
    location = _as_text(location)
    job_title = _as_text(job_title)
    description = _as_text(description)

    prompt = f"""
You're a classification assistant. Based on the job title and description, classify the job into one of the following categories only:
{", ".join(categories)}.

Respond with only one category. Do not invent or guess new categories.

Job Title: {job_title}
Job Description: {description}
Company Name: {company}
Location: {location}
"""
    try:
        result = subprocess.run(
            ["ollama", "run", "gemma3:1b"],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=80,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        # OSError: the ollama binary is missing or not executable;
        # SubprocessError: includes TimeoutExpired;
        # UnicodeError: the prompt could not be encoded.
        print(f"⚠️ LLM failed for {job_title[:30]}: {e}")
        return "Others"

    if result.returncode != 0:
        # Surface the failure instead of silently treating empty stdout as
        # an "Others" answer.
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        print(
            f"⚠️ LLM failed for {job_title[:30]}: "
            f"exit code {result.returncode}: {detail}"
        )
        return "Others"

    output = result.stdout.decode("utf-8", errors="replace")
    return _match_category(output, categories) or "Others"
44
# Output file setup
output_file = 'classified_jobs_updated_llm.csv'


def load_checkpoint(path):
    """Load rows saved by an earlier run so processing can resume.

    Args:
        path: Location of the output CSV written by a previous run.

    Returns:
        A ``(saved_rows, next_index)`` tuple. ``saved_rows`` is the DataFrame
        read from *path* and ``next_index`` is its row count, i.e. the
        position of the first row that still needs processing. When *path*
        does not exist, or exists but is completely empty, the result is an
        empty DataFrame and ``0``.
    """
    if not os.path.exists(path):
        return pd.DataFrame(), 0
    try:
        saved_rows = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it as "nothing
        # saved yet" instead of crashing on resume.
        return pd.DataFrame(), 0
    return saved_rows, len(saved_rows)


final_df, start_idx = load_checkpoint(output_file)
53
# Processing loop: walk the rows not yet saved, re-classify those labelled
# "Others", and checkpoint the output file every 50 rows.
batch = []
for idx in range(start_idx, len(df)):
    row = df.iloc[idx]

    # An empty CSV cell is loaded as float NaN, which has no .strip(); the
    # default passed to .get() only applies when the column itself is absent.
    # A missing label is kept as '' and is not sent to the LLM.
    raw_class = row.get('classified_job', '')
    current_class = raw_class.strip() if isinstance(raw_class, str) else ''

    if current_class.lower() == "others":
        updated_class = classify_with_llm(
            row.get('company', ''),
            row.get('location', ''),
            row.get('job_title', ''),
            row.get('description', '')
        )
    else:
        updated_class = current_class

    row_data = row.to_dict()
    row_data['updated_category'] = updated_class  # Optional: store the updated classification
    batch.append(row_data)

    # Flush on every 50th row and on the very last row, so an interrupted run
    # can resume from the number of rows already written.
    if (idx + 1) % 50 == 0 or (idx + 1) == len(df):
        temp_df = pd.DataFrame(batch)
        final_df = pd.concat([final_df, temp_df], ignore_index=True)
        final_df.to_csv(output_file, index=False)
        batch = []
        print(f"✅ Saved {idx + 1} rows so far")

print("🎉 Done. All 'Others' reclassified and saved.")

import pandas as pd
import subprocess
import os

# Load data; header names are trimmed and lower-cased for stable lookups.
df = pd.read_csv('classified_jobs.csv')
df.columns = [col.strip().lower() for col in df.columns]

# Categories to validate LLM output
categories = [
    "Banking", "Insurance", "General Financial Services", "Healthcare", "Education",
    "Defence", "Government - Federal", "Government - State", "Retail",
    "Food and Beverages", "Fast Moving Consumer Goods (FMCG)",
    "Travel and Tourism", "Airline and Aviation", "Consulting", "Technology",
    "Telco", "Government - local", "Mining, Resources & Energy", "Others"
]


def _as_text(value):
    """Return *value* as text, mapping missing values (None/NaN/NA) to ''."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


def _match_category(raw_output, allowed):
    """Map raw model output onto one of *allowed*; return None if no match.

    Only the first non-blank line is used; surrounding whitespace, quotes,
    markdown emphasis and a trailing period are ignored, and the comparison
    is case-insensitive.
    """
    lines = raw_output.strip().splitlines()  # splitlines() also handles \r\n
    if not lines:
        return None
    candidate = lines[0].strip().strip("*`\"'.").strip().casefold()
    by_folded_name = {name.casefold(): name for name in allowed}
    return by_folded_name.get(candidate)


# LLM-based classification for only "Others"
def classify_with_llm(company, location, job_title, description):
    """Ask a local Ollama model to pick a category for one job listing.

    Missing field values (None/NaN) are sent as empty strings. Returns one of
    the strings in ``categories``; falls back to "Others" when the model
    cannot be run, exits with an error, times out (80 seconds), or answers
    with something that is not a known category.
    """
    company = _as_text(company)
    location = _as_text(location)
    job_title = _as_text(job_title)
    description = _as_text(description)

    prompt = f"""
You're a classification assistant. Based on the job title and description, classify the job into one of the following categories only:
{", ".join(categories)}.

Respond with only one category. Do not invent or guess new categories.

Job Title: {job_title}
Job Description: {description}
Company Name: {company}
Location: {location}
"""
    try:
        result = subprocess.run(
            ["ollama", "run", "gemma3:1b"],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=80,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        # Missing binary, timeout, or an un-encodable prompt.
        print(f"⚠️ LLM failed for {job_title[:30]}: {e}")
        return "Others"

    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        print(
            f"⚠️ LLM failed for {job_title[:30]}: "
            f"exit code {result.returncode}: {detail}"
        )
        return "Others"

    output = result.stdout.decode("utf-8", errors="replace")
    return _match_category(output, categories) or "Others"


# Output file setup: resume after the rows an earlier run already saved.
output_file = 'classified_jobs_updated_llm.csv'
final_df = pd.DataFrame()
start_idx = 0
if os.path.exists(output_file):
    try:
        final_df = pd.read_csv(output_file)
        start_idx = len(final_df)
    except pd.errors.EmptyDataError:
        # A zero-byte output file means nothing was saved yet.
        pass

# Processing loop
batch = []
for idx in range(start_idx, len(df)):
    row = df.iloc[idx]

    # An empty CSV cell is float NaN, which has no .strip(); keep it as ''.
    raw_class = row.get('classified_job', '')
    current_class = raw_class.strip() if isinstance(raw_class, str) else ''

    if current_class.lower() == "others":
        updated_class = classify_with_llm(
            row.get('company', ''),
            row.get('location', ''),
            row.get('job_title', ''),
            row.get('description', '')
        )
    else:
        updated_class = current_class

    row_data = row.to_dict()
    row_data['updated_category'] = updated_class  # Optional: store the updated classification
    batch.append(row_data)

    # Checkpoint every 50 rows and after the final row.
    if (idx + 1) % 50 == 0 or (idx + 1) == len(df):
        temp_df = pd.DataFrame(batch)
        final_df = pd.concat([final_df, temp_df], ignore_index=True)
        final_df.to_csv(output_file, index=False)
        batch = []
        print(f"✅ Saved {idx + 1} rows so far")

print("🎉 Done. All 'Others' reclassified and saved.")

Classification of Jobs which are in the 'Others' category (Source code)

Next Up

Lesson 36: Plotting charts for tool extracted from dataset (Source code)

Next Up

Lesson 36: Plotting charts for tool extracted from dataset (Source code)