✅ AI-Powered Classification - Uses the Gemma 3 1B model (`gemma3:1b`), served by a local Ollama instance, to analyze the job details of ICT listings and assign each one a proper industry category
✅ Resume Capability - Saves progress every 10 jobs and, if interrupted, can restart from where it stopped
✅ Robust Error Handling - Up to 5 attempts per LLM request with exponential backoff, plus smart category matching with keyword fallbacks
1import os2import pandas as pd3import time4import requests5import json6import logging7import re8
# Configure the root logger once, at import time: records at INFO level and
# above are written both to "job_classification.log" and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler("job_classification.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Closed set of industry labels the classifier may return. The list is
# interpolated into the LLM prompt and used to validate the model's answer;
# "Others" is the catch-all used when nothing else matches.
INDUSTRY_CATEGORIES = [
    "Banking Insurance General Financial Services",
    "Healthcare",
    "Education",
    "Defence Government - Federal",
    "Government - State",
    "Retail",
    "Food and Beverages",
    "Fast Moving Consumer Goods (FMCG)",
    "Travel and Tourism",
    "Airline and Aviation",
    "Consulting",
    "Technology",
    "Telco",
    "Government - local",
    "Mining, Resources & Energy",
    "Others",
]

def classify_with_ollama(
    job_detail,
    *,
    model="gemma3:1b",
    endpoint="http://localhost:11434/api/generate",
    max_retries=5,
    retry_delay=5,
    timeout_seconds=80,
):
    """Ask a local Ollama model to classify a job description into one industry.

    The model is prompted with the full list of ``INDUSTRY_CATEGORIES`` and its
    free-text answer is mapped back onto that list by ``find_best_match``.

    Args:
        job_detail: Free-text job description to classify.
        model: Ollama model tag sent in the request body.
        endpoint: URL of the Ollama ``/api/generate`` endpoint.
        max_retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure (exponential backoff).
        timeout_seconds: Per-request timeout in seconds.

    Returns:
        One of the category names chosen by ``find_best_match``, or "Others"
        when every attempt fails.
    """
    prompt = f"""
    Based on the job detail below, classify it into ONE of the following categories.
    Return ONLY the category name and nothing else.

    Categories:
    {', '.join(INDUSTRY_CATEGORIES)}

    Job detail: {job_detail}

    Classification:
    """

    for attempt in range(1, max_retries + 1):
        try:
            print(f"→ [Attempt {attempt}] Calling Ollama...")
            response = requests.post(
                endpoint,
                json={"model": model, "prompt": prompt, "stream": False},
                timeout=timeout_seconds,
            )

            if response.status_code == 200:
                result = response.json()
                if isinstance(result, dict):
                    # A missing or null "response" field is treated as an
                    # empty answer, which find_best_match maps to a fallback.
                    classification = str(result.get("response") or "").strip()
                    return find_best_match(classification, INDUSTRY_CATEGORIES)
                logging.warning(
                    f"⚠️ Ollama returned unexpected JSON on attempt {attempt}: {result!r}"
                )
            else:
                logging.warning(
                    f"⚠️ Ollama responded with status {response.status_code}: {response.text}"
                )

        except requests.exceptions.RequestException as e:
            logging.warning(f"⚠️ LLM request failed on attempt {attempt}: {str(e)}")
        except ValueError as e:
            # Older versions of requests raise a plain ValueError (not a
            # RequestException) when the body is not valid JSON; retry
            # instead of letting it abort the caller's whole run.
            logging.warning(f"⚠️ Ollama returned invalid JSON on attempt {attempt}: {str(e)}")

        if attempt < max_retries:
            print(f"⏳ Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # exponential backoff

    logging.error("❌ All retries failed for LLM call.")
    return "Others"


# Keyword fragments that hint at a category when the model's answer does not
# contain any category name verbatim. Groups are checked in this order.
_KEYWORD_FALLBACKS = (
    ("Technology", ("tech", "software", "digital", "computer")),
    ("Banking Insurance General Financial Services", ("bank", "financ", "insurance", "invest")),
    ("Healthcare", ("health", "medic", "care", "hospital")),
    ("Education", ("educat", "school", "teach", "academic")),
)

# "IT" as a standalone word. A word-boundary match also works at the very end
# of the answer and does not fire on words that merely end in "it".
_IT_WORD = re.compile(r"\bit\b")

# Answers shorter than this are too ambiguous to match against a fragment of a
# category name.
_MIN_PARTIAL_LENGTH = 3


def find_best_match(response, categories):
    """Map a free-text model answer onto one of the known category names.

    Matching is case-insensitive and proceeds from strict to loose:

    1. the answer equals a category name;
    2. a category name appears somewhere inside the answer;
    3. the answer contains a known keyword (e.g. "bank", "software", "IT");
    4. the answer is a fragment of exactly one category name
       (e.g. "Mining" -> "Mining, Resources & Energy").

    Args:
        response: Raw text returned by the model; ``None`` is treated as empty.
        categories: Iterable of valid category names, in priority order.

    Returns:
        The matched category, spelled as in ``categories`` for steps 1, 2
        and 4; step 3 returns one of four fixed names ("Technology",
        "Banking Insurance General Financial Services", "Healthcare",
        "Education"). "Others" is returned when nothing matches.
    """
    text = str(response or "").lower().strip()
    if not text:
        return "Others"

    lowered = [(category, category.lower()) for category in categories]

    for category, key in lowered:
        if text == key:
            return category

    for category, key in lowered:
        if key in text:
            return category

    if _IT_WORD.search(text):
        return "Technology"
    for category, keywords in _KEYWORD_FALLBACKS:
        if any(keyword in text for keyword in keywords):
            return category

    # Last resort: the model abbreviated a category name. Accept it only when
    # the fragment identifies a single category, so "government" (which fits
    # three categories) still falls through to "Others".
    if len(text) >= _MIN_PARTIAL_LENGTH:
        partial = [category for category, key in lowered if text in key]
        if len(partial) == 1:
            return partial[0]

    return "Others"

def normalize_industry(industry_raw):
    """Return a canonical lowercase form of an industry label for matching.

    The value is converted to ``str``, trimmed and lowercased; "&" becomes
    "and", parentheses are removed, and every run of whitespace is collapsed
    to a single space.
    """
    text = str(industry_raw).strip().lower()
    for old, new in (("&", "and"), ("(", ""), (")", "")):
        text = text.replace(old, new)
    return re.sub(r'\s+', ' ', text)

def _load_checkpoint(checkpoint_file):
    """Return the row index to resume from, or 0 if no usable checkpoint exists."""
    if not os.path.exists(checkpoint_file):
        return 0
    try:
        with open(checkpoint_file, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
        return max(0, int(checkpoint.get('next_index', 0)))
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # A truncated or hand-edited checkpoint must not block the run.
        logging.warning(f"Ignoring unreadable checkpoint {checkpoint_file}: {e}")
        return 0


def _save_progress(rows, columns, output_file, checkpoint_file):
    """Write every finished row to the CSV, then record how many rows are done."""
    pd.DataFrame(rows, columns=columns).to_csv(output_file, index=False)
    with open(checkpoint_file, 'w', encoding='utf-8') as f:
        json.dump({'next_index': len(rows), 'processed_count': len(rows)}, f)


def process_dataframe(input_file, checkpoint_file="checkpoint.json",
                      output_file="classified_jobs.csv", checkpoint_every=10):
    """Classify every job in a CSV and write the result with a new column.

    Rows whose normalized ``industry_type`` contains "information and
    communication technology" are sent to ``classify_with_ollama``; all other
    rows keep their original industry. The outcome is stored in a new
    ``classified_job`` column.

    Progress is written to ``output_file`` and ``checkpoint_file`` every
    ``checkpoint_every`` rows, after the last row, and when an unexpected
    error occurs. A later call resumes after the last row recorded in the
    checkpoint. On resume the saved output is trimmed to the checkpointed row
    count, so rows written after the last checkpoint are never duplicated.

    Args:
        input_file: Path of the CSV to read. It must contain ``industry_type``
            and either ``job_detail`` or ``job_details``.
        checkpoint_file: Path of the JSON file that stores resume state.
        output_file: Path of the CSV that receives the classified rows.
        checkpoint_every: Number of rows between progress saves (minimum 1).

    Returns:
        A DataFrame of all processed rows, or ``None`` when a required column
        is missing or an unexpected error interrupted the run.
    """
    df = pd.read_csv(input_file)
    print("Columns in CSV:", df.columns.tolist())

    if 'industry_type' not in df.columns:
        logging.error("Missing required column: industry_type")
        return None

    job_detail_col = next(
        (name for name in ('job_detail', 'job_details') if name in df.columns),
        None,
    )
    if job_detail_col is None:
        logging.error("Missing job_detail or job_details column.")
        return None

    total_rows = len(df)
    logging.info(f"Loaded {total_rows} rows")
    out_columns = df.columns.tolist() + ['classified_job']
    checkpoint_every = max(1, int(checkpoint_every))

    # Finished rows are kept in a plain list; rebuilding the DataFrame with
    # pd.concat for every row would copy all previous rows each time.
    rows = []
    start_idx = _load_checkpoint(checkpoint_file)
    if start_idx > 0 and os.path.exists(output_file):
        previous = pd.read_csv(output_file)
        if len(previous) < start_idx:
            logging.warning(
                f"Output has only {len(previous)} rows but checkpoint says "
                f"{start_idx}; resuming from row {len(previous)}"
            )
            start_idx = len(previous)
        # Drop anything saved beyond the checkpoint so it is not duplicated.
        rows = previous.iloc[:start_idx].to_dict('records')
    elif start_idx > 0:
        logging.warning(
            f"Checkpoint found but {output_file} is missing; restarting from row 0"
        )
        start_idx = 0
    if start_idx > 0:
        logging.info(f"Resuming from index {start_idx}")

    try:
        for idx in range(start_idx, total_rows):
            row = df.iloc[idx]
            raw_industry = row['industry_type']
            industry = normalize_industry(raw_industry)
            job_detail = str(row[job_detail_col])

            print(f"\nProcessing row {idx + 1}: raw industry_type = {raw_industry} → normalized = {industry}")

            if 'information and communication technology' in industry:
                print("ICT detected → calling LLM...")
                classified = classify_with_ollama(job_detail)
            else:
                print("Non-ICT → keeping original industry")
                classified = raw_industry

            new_row = row.to_dict()
            new_row['classified_job'] = classified
            rows.append(new_row)

            if len(rows) % checkpoint_every == 0 or idx == total_rows - 1:
                _save_progress(rows, out_columns, output_file, checkpoint_file)
                logging.info(f"Checkpoint saved at row {idx + 1}")

    except Exception as e:
        # Best effort: keep what was finished, and keep the checkpoint in step
        # with the CSV so the next run neither repeats nor skips rows.
        logging.exception(f"Unexpected error: {str(e)}")
        _save_progress(rows, out_columns, output_file, checkpoint_file)
        return None

    print(f"\n✅ Processing complete. Output saved to '{output_file}'")
    return pd.DataFrame(rows, columns=out_columns)

# Script entry point: classify the jobs in "seek_jobs.csv" if that file exists
# in the current working directory, otherwise report that it is missing.
if __name__ == "__main__":
    input_file = "seek_jobs.csv"
    if os.path.exists(input_file):
        print("🚀 Starting classification...")
        process_dataframe(input_file)
    else:
        print(f"❌ File not found: {input_file}")