AusBiz Consulting

✅ AI-Powered Classification - Uses Ollama's Gemma 3:1b model to analyze job details and assign proper industry categories

✅ Resume Capability - Saves progress every 10 jobs and, if interrupted, restarts from where it stopped

✅ Robust Error Handling - Up to 5 attempts per request with exponential backoff between them; smart category matching with keyword fallbacks

Python

1import os
2import pandas as pd
3import time
4import requests
5import json
6import logging
7import re
8
# Configure the root logger once at import time: records at INFO and above
# are written both to job_classification.log and to a stream handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler("job_classification.log"), logging.StreamHandler()],
)
18
# Target labels for classification. Order matters: find_best_match scans this
# list front to back and returns the first category it finds.
INDUSTRY_CATEGORIES = [
    # Finance, health and education
    "Banking Insurance General Financial Services", "Healthcare", "Education",
    # Government (federal / state)
    "Defence Government - Federal", "Government - State",
    # Consumer-facing sectors
    "Retail", "Food and Beverages", "Fast Moving Consumer Goods (FMCG)",
    "Travel and Tourism", "Airline and Aviation",
    # Professional services and technology
    "Consulting", "Technology", "Telco",
    # Remaining sectors and the catch-all
    "Government - local", "Mining, Resources & Energy", "Others",
]
37
def classify_with_ollama(job_detail):
    """Classify a job description into one of INDUSTRY_CATEGORIES using Ollama.

    Sends a single non-streaming prompt to the local Ollama HTTP endpoint and
    maps the model's free-text answer onto a known category with
    find_best_match.

    Args:
        job_detail: Text of the job posting; interpolated into the prompt.

    Returns:
        A category name. "Others" is returned when every attempt fails, so a
        failure is indistinguishable from a genuine "Others" classification
        (the failure is only visible in the log).
    """
    prompt = f"""
    Based on the job detail below, classify it into ONE of the following categories. 
    Return ONLY the category name and nothing else.

    Categories:
    {', '.join(INDUSTRY_CATEGORIES)}

    Job detail: {job_detail}

    Classification:
    """

    max_retries = 5  # total number of attempts, including the first one
    retry_delay = 5  # seconds to wait before the next attempt; doubles each time
    timeout_seconds = 80  # per-request timeout, in seconds

    for attempt in range(1, max_retries + 1):
        classification = None
        try:
            print(f"→ [Attempt {attempt}] Calling Ollama...")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "gemma3:1b", "prompt": prompt, "stream": False},
                timeout=timeout_seconds
            )

            if response.status_code == 200:
                # A null or missing "response" field is treated as an empty answer.
                classification = str(response.json().get("response") or "").strip()
            else:
                logging.warning(f"⚠️ Ollama responded with status {response.status_code}: {response.text}")

        except requests.exceptions.RequestException as e:
            logging.warning(f"⚠️ LLM request failed on attempt {attempt}: {str(e)}")
        except (ValueError, AttributeError) as e:
            # A 200 reply whose body is not JSON (ValueError) or not a JSON
            # object (AttributeError on .get) is retried like any other
            # failed attempt instead of aborting the whole run.
            logging.warning(f"⚠️ Malformed Ollama response on attempt {attempt}: {str(e)}")

        # Matching happens outside the try so its own errors are not mistaken
        # for transport failures.
        if classification is not None:
            return find_best_match(classification, INDUSTRY_CATEGORIES)

        if attempt < max_retries:
            print(f"⏳ Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # exponential backoff

    logging.error("❌ All retries failed for LLM call.")
    return "Others"
81
82
# Ordered keyword fallbacks used when the model's answer names no category
# verbatim. The first pattern that matches wins.
_KEYWORD_FALLBACKS = (
    # \bit\b: "it" as a whole word, so "IT" / "IT services" match but
    # "audit" or "credit" do not.
    ("Technology", re.compile(r"tech|\bit\b|software|digital|computer")),
    ("Banking Insurance General Financial Services",
     re.compile(r"bank|financ|insurance|invest")),
    # care(?!er): match "care", "healthcare", "carer", but not "career".
    ("Healthcare", re.compile(r"health|medic|care(?!er)|hospital")),
    ("Education", re.compile(r"educat|school|teach|academic")),
)


def find_best_match(response, categories):
    """Map a free-text model answer onto one of the known categories.

    Matching is case-insensitive and proceeds in three stages:
      1. the answer equals a category name;
      2. a category name occurs inside the answer (first one in
         ``categories`` order wins);
      3. a keyword fallback for Technology, Banking, Healthcare, Education.

    Args:
        response: Raw text returned by the model; ``None`` is treated as "".
        categories: Iterable of category names to match against.

    Returns:
        The matching category (as spelled in ``categories`` for stages 1-2),
        or "Others" when nothing matches.
    """
    text = str(response or "").lower().strip()

    for category in categories:
        if text == category.lower():
            return category

    for category in categories:
        if category.lower() in text:
            return category

    for category, pattern in _KEYWORD_FALLBACKS:
        if pattern.search(text):
            return category

    return "Others"
100
def normalize_industry(industry_raw):
    """Return a canonical lower-case form of an industry label for matching.

    The value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased; then '&' becomes 'and', parentheses are removed, and every
    run of whitespace is collapsed to a single space.
    """
    # One pass instead of chained replace(): '&' -> 'and', drop '(' and ')'.
    substitutions = str.maketrans({'&': 'and', '(': None, ')': None})
    lowered = str(industry_raw).strip().lower()
    return re.sub(r'\s+', ' ', lowered.translate(substitutions))
107
def _with_classifications(df, classified):
    """Return the first len(classified) rows of df plus a 'classified_job' column."""
    snapshot = df.iloc[:len(classified)].copy()
    snapshot['classified_job'] = classified
    return snapshot


def _save_progress(df, classified, output_file, checkpoint_file):
    """Write the rows classified so far and a checkpoint that matches them."""
    _with_classifications(df, classified).to_csv(output_file, index=False)
    done = len(classified)
    # Output is written first; if the run dies before the checkpoint is
    # updated, the resume logic truncates the output back to the checkpoint.
    with open(checkpoint_file, 'w') as f:
        json.dump({'next_index': done, 'processed_count': done}, f)


def _load_resume_state(total_rows, output_file, checkpoint_file):
    """Return the labels produced by an earlier run, or [] if none are usable."""
    if not os.path.exists(checkpoint_file):
        return []
    try:
        with open(checkpoint_file, 'r') as f:
            next_index = int(json.load(f).get('next_index', 0))
    except (OSError, ValueError, TypeError, AttributeError) as e:
        logging.warning(f"Ignoring unreadable checkpoint {checkpoint_file}: {e}")
        return []

    next_index = max(0, min(next_index, total_rows))
    if next_index == 0:
        return []

    try:
        labels = pd.read_csv(output_file)['classified_job'].tolist()
    except (OSError, ValueError, KeyError) as e:
        logging.warning(f"Cannot reuse previous output {output_file}: {e}; starting over")
        return []

    if len(labels) < next_index:
        logging.warning(
            f"Output has {len(labels)} rows but checkpoint says {next_index}; "
            f"resuming from row {len(labels)}"
        )
    # Never keep more rows than the checkpoint vouches for, so rows are not
    # duplicated when the output was saved after the last checkpoint.
    return labels[:next_index]


def process_dataframe(input_file, checkpoint_file="checkpoint.json",
                      output_file="classified_jobs.csv", checkpoint_every=10):
    """Classify every job in a CSV and write the result with resume support.

    Rows whose normalized ``industry_type`` contains
    'information and communication technology' are sent to
    classify_with_ollama; all other rows keep their original industry. The
    result is stored in a new ``classified_job`` column.

    Args:
        input_file: Path of the CSV to read. Must contain ``industry_type``
            and either ``job_detail`` or ``job_details``.
        checkpoint_file: JSON file recording the next row index to process.
        output_file: CSV that receives the classified rows.
        checkpoint_every: Save output and checkpoint after this many rows.

    Returns:
        A DataFrame with the input rows plus ``classified_job``, or ``None``
        if a required column is missing or an unexpected error interrupted
        the run (partial progress is saved in that case).

    Raises:
        ValueError: If ``checkpoint_every`` is smaller than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be at least 1")

    df = pd.read_csv(input_file)
    print("Columns in CSV:", df.columns.tolist())

    if 'industry_type' not in df.columns:
        logging.error("Missing required column: industry_type")
        return

    job_detail_col = next(
        (name for name in ('job_detail', 'job_details') if name in df.columns), None
    )
    if not job_detail_col:
        logging.error("Missing job_detail or job_details column.")
        return

    total_rows = len(df)
    logging.info(f"Loaded {total_rows} rows")

    # Labels are accumulated in a plain list and joined to df only when
    # saving, instead of growing a DataFrame with one concat per row.
    classified = _load_resume_state(total_rows, output_file, checkpoint_file)
    start_idx = len(classified)
    if start_idx:
        logging.info(f"Resuming from index {start_idx}")

    try:
        for idx in range(start_idx, total_rows):
            row = df.iloc[idx]
            raw_industry = row['industry_type']
            industry = normalize_industry(raw_industry)
            job_detail = str(row[job_detail_col])

            print(f"\nProcessing row {idx + 1}: raw industry_type = {raw_industry} → normalized = {industry}")

            if 'information and communication technology' in industry:
                print("ICT detected → calling LLM...")
                label = classify_with_ollama(job_detail)
            else:
                print("Non-ICT → keeping original industry")
                label = raw_industry

            classified.append(label)

            if len(classified) % checkpoint_every == 0 or idx == total_rows - 1:
                _save_progress(df, classified, output_file, checkpoint_file)
                logging.info(f"Checkpoint saved at row {idx + 1}")

    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        # Save output AND checkpoint together so the next run resumes exactly
        # after the last saved row rather than re-appending rows.
        _save_progress(df, classified, output_file, checkpoint_file)
        return None

    print(f"\n✅ Processing complete. Output saved to '{output_file}'")
    return _with_classifications(df, classified)
172
# Script entry point: classify seek_jobs.csv from the working directory,
# or report that the file is missing.
if __name__ == "__main__":
    input_file = "seek_jobs.csv"
    if os.path.exists(input_file):
        print("🚀 Starting classification...")
        process_dataframe(input_file)
    else:
        print(f"❌ File not found: {input_file}")

1import os 2import pandas as pd 3import time 4import requests 5import json 6import logging 7import re 8 9# Setup logging 10logging.basicConfig( 11 level=logging.INFO, 12 format='%(asctime)s - %(levelname)s - %(message)s', 13 handlers=[ 14 logging.FileHandler("job_classification.log"), 15 logging.StreamHandler() 16 ] 17) 18 19INDUSTRY_CATEGORIES = [ 20 "Banking Insurance General Financial Services", 21 "Healthcare", 22 "Education", 23 "Defence Government - Federal", 24 "Government - State", 25 "Retail", 26 "Food and Beverages", 27 "Fast Moving Consumer Goods (FMCG)", 28 "Travel and Tourism", 29 "Airline and Aviation", 30 "Consulting", 31 "Technology", 32 "Telco", 33 "Government - local", 34 "Mining, Resources & Energy", 35 "Others" 36] 37 38def classify_with_ollama(job_detail): 39 prompt = f""" 40 Based on the job detail below, classify it into ONE of the following categories. 41 Return ONLY the category name and nothing else. 42 43 Categories: 44 {', '.join(INDUSTRY_CATEGORIES)} 45 46 Job detail: {job_detail} 47 48 Classification: 49 """ 50 51 max_retries = 5 52 retry_delay = 5 # start with 5 seconds 53 timeout_seconds = 80 # allow 2 minutes for large models 54 55 for attempt in range(1, max_retries + 1): 56 try: 57 print(f"→ [Attempt {attempt}] Calling Ollama...") 58 response = requests.post( 59 "http://localhost:11434/api/generate", 60 json={"model": "gemma3:1b", "prompt": prompt, "stream": False}, 61 timeout=timeout_seconds 62 ) 63 64 if response.status_code == 200: 65 result = response.json() 66 classification = result.get("response", "").strip() 67 return find_best_match(classification, INDUSTRY_CATEGORIES) 68 else: 69 logging.warning(f"⚠️ Ollama responded with status {response.status_code}: {response.text}") 70 71 except requests.exceptions.RequestException as e: 72 logging.warning(f"⚠️ LLM request failed on attempt {attempt}: {str(e)}") 73 74 if attempt < max_retries: 75 print(f"⏳ Retrying in {retry_delay} seconds...") 76 time.sleep(retry_delay) 77 
retry_delay *= 2 # exponential backoff 78 79 logging.error("❌ All retries failed for LLM call.") 80 return "Others" 81 82 83def find_best_match(response, categories): 84 response = response.lower().strip() 85 for category in categories: 86 if response == category.lower(): 87 return category 88 for category in categories: 89 if category.lower() in response: 90 return category 91 if any(kw in response for kw in ["tech", "it ", "software", "digital", "computer"]): 92 return "Technology" 93 elif any(kw in response for kw in ["bank", "financ", "insurance", "invest"]): 94 return "Banking Insurance General Financial Services" 95 elif any(kw in response for kw in ["health", "medic", "care", "hospital"]): 96 return "Healthcare" 97 elif any(kw in response for kw in ["educat", "school", "teach", "academic"]): 98 return "Education" 99 return "Others" 100 101def normalize_industry(industry_raw): 102 """Normalize industry string for matching""" 103 industry = str(industry_raw).strip().lower() 104 industry = industry.replace('&', 'and').replace('(', '').replace(')', '') 105 industry = re.sub(r'\s+', ' ', industry) 106 return industry 107 108def process_dataframe(input_file, checkpoint_file="checkpoint.json"): 109 df = pd.read_csv(input_file) 110 print("Columns in CSV:", df.columns.tolist()) 111 112 if 'industry_type' not in df.columns: 113 logging.error("Missing required column: industry_type") 114 return 115 116 job_detail_col = 'job_detail' if 'job_detail' in df.columns else 'job_details' if 'job_details' in df.columns else None 117 if not job_detail_col: 118 logging.error("Missing job_detail or job_details column.") 119 return 120 121 total_rows = len(df) 122 logging.info(f"Loaded {total_rows} rows") 123 start_idx = 0 124 processed_count = 0 125 126 if os.path.exists(checkpoint_file): 127 with open(checkpoint_file, 'r') as f: 128 checkpoint = json.load(f) 129 start_idx = checkpoint.get('next_index', 0) 130 processed_count = checkpoint.get('processed_count', 0) 131 
logging.info(f"Resuming from index {start_idx}") 132 133 output_file = 'classified_jobs.csv' 134 if os.path.exists(output_file) and start_idx > 0: 135 processed_df = pd.read_csv(output_file) 136 else: 137 processed_df = pd.DataFrame(columns=df.columns.tolist() + ['classified_job']) 138 139 try: 140 for idx in range(start_idx, total_rows): 141 row = df.iloc[idx] 142 raw_industry = row['industry_type'] 143 industry = normalize_industry(raw_industry) 144 job_detail = str(row[job_detail_col]) 145 146 print(f"\nProcessing row {idx + 1}: raw industry_type = {raw_industry} → normalized = {industry}") 147 148 if 'information and communication technology' in industry: 149 print("ICT detected → calling LLM...") 150 classified = classify_with_ollama(job_detail) 151 else: 152 print("Non-ICT → keeping original industry") 153 classified = raw_industry 154 155 new_row = row.to_dict() 156 new_row['classified_job'] = classified 157 processed_df = pd.concat([processed_df, pd.DataFrame([new_row])], ignore_index=True) 158 processed_count += 1 159 160 if processed_count % 10 == 0 or idx == total_rows - 1: 161 processed_df.to_csv(output_file, index=False) 162 with open(checkpoint_file, 'w') as f: 163 json.dump({'next_index': idx + 1, 'processed_count': processed_count}, f) 164 logging.info(f"Checkpoint saved at row {idx + 1}") 165 166 print("\n✅ Processing complete. Output saved to 'classified_jobs.csv'") 167 return processed_df 168 169 except Exception as e: 170 logging.error(f"Unexpected error: {str(e)}") 171 processed_df.to_csv(output_file, index=False) 172 173if __name__ == "__main__": 174 input_file = "seek_jobs.csv" 175 if not os.path.exists(input_file): 176 print(f"❌ File not found: {input_file}") 177 else: 178 print("🚀 Starting classification...") 179 process_dataframe(input_file)

Industry classification and saving data to a CSV file (source code)

Next Up

Lesson 35: Classification of jobs that are in the 'Others' category (source code)

Next Up

Lesson 35: Classification of jobs that are in the 'Others' category (source code)