1import pandas as pd
2import time
3import re
4import requests
5import json
6
def classify_sector_with_ollama(
    company_name,
    industry_type,
    sub_classification,
    description,
    *,
    model="gemma3:1b",
    api_url="http://localhost:11434/api/generate",
    max_retries=3,
    timeout_seconds=45,
):
    """
    Classify a company (Recruiter vs Direct employer, plus industry) via Ollama.

    The four job fields are embedded in a fixed instruction prompt and posted
    to the Ollama ``generate`` endpoint with streaming disabled.

    Args:
        company_name: Company name from the job record (may be None/NaN).
        industry_type: Industry type from the job record (may be None/NaN).
        sub_classification: Sub-classification from the record (may be None/NaN).
        description: Job description text (may be None/NaN).
        model: Name of the Ollama model to query.
        api_url: URL of the Ollama ``generate`` endpoint.
        max_retries: Maximum number of request attempts.
        timeout_seconds: Per-request timeout passed to ``requests.post``.

    Returns:
        The stripped ``"response"`` text of the first HTTP 200 reply, or None
        when every field is missing/blank or when all attempts fail.
    """

    def _field_text(value):
        # pd.isna() is True for both None and NaN; render those as empty text.
        return "" if pd.isna(value) else str(value)

    name_text, industry_text, sub_text, description_text = (
        _field_text(value)
        for value in (company_name, industry_type, sub_classification, description)
    )

    # Check the individual fields: the combined text always contains the field
    # labels, so testing it for emptiness would never detect a blank record.
    if not any(
        text.strip()
        for text in (name_text, industry_text, sub_text, description_text)
    ):
        return None

    combined_text = f"""
    Company Name: {name_text}
    Industry Type: {industry_text}
    Sub-classification: {sub_text}
    Description: {description_text}
    """

    prompt = f"""
    You are an AI model that classifies companies into Recruiter or Direct Employer, and identifies their industry.

    FIRST TASK - COMPANY TYPE CLASSIFICATION:
    Determine if the company is a Recruiter or Direct Employer. This is extremely important.

    RECRUITER INDICATORS (if ANY of these are present, classify as RECRUITER):
    - Phrases like "our client", "on behalf of", "recruiting for"
    - Mentions of "a client in the ___ industry"
    - Company name contains words like "Recruitment", "Talent", "Personnel", "Staffing", "People"
    - Description mentions placing candidates with other companies
    - Job posting focuses on what "the client" needs rather than what "we" need
    - Description mentions multiple different positions across different companies

    DIRECT EMPLOYER INDICATORS:
    - Job description talks about "our company", "our team", "join us"
    - No mention of clients or recruiting on behalf of others
    - Company described as the actual place where work will be performed

    SECOND TASK - INDUSTRY CLASSIFICATION:

    If company is a RECRUITER:
    - Do NOT use the recruiter's company name for industry classification
    - Look for client industry mentions in the description
    - If multiple clients mentioned, choose the most prominent industry

    If company is a DIRECT EMPLOYER:
    - Use both company name and job description
    - Check if industry_type field contains specific industry information
    - If company name contains "Consulting" and is not a recruiter, classify as "Consulting"

    FORMAT your response EXACTLY as follows:
    Company Name: [final cleaned company name]
    Company Type: [Recruiter or Direct]
    Inferred Industry: [single-word or hyphenated industry name]

    Job Information:
    {combined_text}
    """

    for attempt in range(1, max_retries + 1):
        retry_delay = 2
        try:
            print(f"  Attempt {attempt}/{max_retries} for {company_name}...")

            response = requests.post(
                api_url,
                json={
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                },
                timeout=timeout_seconds,
            )

            if response.status_code == 200:
                result = response.json()["response"]
                print(f"[✔] {company_name} → Successfully classified")
                return result.strip()

            print(f"Error from Ollama API: {response.status_code} - {response.text}")
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt}. Waiting before retry...")
            retry_delay = 3
        except Exception as e:
            # Best-effort boundary: any failure (connection error, malformed
            # JSON, missing "response" key) is reported and retried.
            print(f"Exception when calling Ollama API: {str(e)}")

        # Back off only when another attempt will follow.
        if attempt < max_retries:
            time.sleep(retry_delay)

    print(f"All {max_retries} attempts failed for {company_name}")
    return None
95
96
def parse_classification_result(result_text):
    """
    Extract the structured fields from a model classification response.

    Each line is split at its first colon. The part before the colon is
    matched case-insensitively against the labels "Company Name",
    "Company Type" and "Inferred Industry", after removing surrounding
    whitespace and common markdown decoration (``*``, ``-``, ``#``, ``>``,
    backticks, bullets). If a label occurs more than once, the last
    occurrence wins.

    Args:
        result_text: Raw response text; None, NaN, other non-strings and
            blank strings are treated as "nothing to parse".

    Returns:
        A ``(company_name, company_type, industry)`` tuple. Fields that are
        missing or have an empty value are None. A non-empty company type is
        normalised to "Recruiter" when it contains "recruit" or "agency",
        otherwise to "Direct".
    """
    if not isinstance(result_text, str) or not result_text.strip():
        return None, None, None

    label_to_field = {
        "company name": "company_name",
        "company type": "company_type",
        "inferred industry": "industry",
    }
    parsed = {"company_name": None, "company_type": None, "industry": None}

    for raw_line in result_text.splitlines():
        label_part, separator, value_part = raw_line.partition(":")
        if not separator:
            continue

        # Models often decorate the expected format, e.g. "**Company Type:** X"
        # or "- Company Type: X"; strip that decoration before matching.
        label = label_part.strip(" \t*-#>`•").lower()
        field = label_to_field.get(label)
        if field is None:
            continue

        value = value_part.strip(" \t*`")
        parsed[field] = value or None

    company_type = parsed["company_type"]
    if company_type:
        lowered_type = company_type.lower()
        if "recruit" in lowered_type or "agency" in lowered_type:
            company_type = "Recruiter"
        else:
            company_type = "Direct"

    return parsed["company_name"], company_type, parsed["industry"]
129
130
def detect_recruiter_in_text(text):
    """
    Heuristically decide whether free text reads like a recruiter's posting.

    Serves as a fallback check: returns True when the lower-cased text
    contains any one of the strong recruiter phrases, or at least two of the
    weaker recruiter keywords. Missing (None/NaN) or empty text yields False.
    """
    if pd.isna(text):
        return False
    if not text:
        return False

    haystack = text.lower()

    # A single occurrence of any of these phrases is decisive.
    strong_signals = (
        "our client",
        "on behalf of",
        "recruiting for",
        "recruitment agency",
        "talent agency",
        "staffing agency",
        "recruitment firm",
        "for our client",
        "for my client",
        "my client is seeking",
        "our client is looking",
    )
    if any(signal in haystack for signal in strong_signals):
        return True

    # These keywords only count when two or more distinct ones appear.
    weak_signals = (
        "recruitment",
        "recruiter",
        "talent acquisition",
        "staffing",
        "personnel agency",
        "employment agency",
        "job agency",
    )
    matched = [word for word in weak_signals if word in haystack]
    return len(matched) >= 2
177
178
def _column_values(df, column):
    """Return the values of *column* as a list, or blanks when it is absent."""
    if column in df.columns:
        return df[column].tolist()
    return [""] * len(df)


def _print_summary(df, company_names):
    """Print sample rows, type/industry distributions and the success rate."""
    print("\n--- Sample Results ---")
    sample_rows = zip(
        company_names[:10],
        df['company_type'].head(10),
        df['industry'].head(10),
    )
    for name, company_type, industry in sample_rows:
        print(f"\n{name}:")
        print(f"  Type: {company_type if not pd.isna(company_type) else 'Unknown'}")
        print(f"  Industry: {industry if not pd.isna(industry) else 'Unknown'}")

    # value_counts() already excludes missing values.
    print("\n--- Company Type Distribution ---")
    for company_type, count in df['company_type'].value_counts().items():
        print(f"{company_type}: {count}")

    print("\n--- Top Industries ---")
    for industry, count in df['industry'].value_counts().head(10).items():
        print(f"{industry}: {count}")

    total_count = len(df)
    classified_count = int(df['industry'].notna().sum())
    if total_count:
        print(
            f"\nClassification success rate: {classified_count / total_count:.1%} "
            f"({classified_count}/{total_count})"
        )
    else:
        # Avoid dividing by zero on an empty dataset.
        print("\nClassification success rate: n/a (0/0)")


def main(input_path="seek_jobs.csv", output_path="seek_jobs_classified.csv", *, delay_seconds=1.5):
    """
    Classify every job record in a CSV file and write the enriched CSV.

    For each row with a non-empty company name, the record is sent to
    ``classify_sector_with_ollama`` and the reply is parsed with
    ``parse_classification_result``. If the company name looks like a
    recruiter but the model answered "Direct", the description is re-checked
    with ``detect_recruiter_in_text`` and the type may be overridden.

    The output file contains the input columns plus
    ``extracted_company_name``, ``company_type``, ``industry`` and
    ``company_classification`` (the raw model reply). These four columns are
    always written, holding missing values for rows that were not classified.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write.
        delay_seconds: Pause after each row that triggered a model call.
    """
    print("✅ Script started")

    print(f"Reading dataset from {input_path}...")
    df = pd.read_csv(input_path)

    print(f"📄 Data loaded: {len(df)} rows")

    company_name_column = "company_name"
    has_name_column = company_name_column in df.columns

    # Pull each input column once; absent columns are treated as blank.
    company_names = _column_values(df, company_name_column)
    industry_types = _column_values(df, "industry_type")
    sub_classifications = _column_values(df, "sub_classification")
    descriptions = _column_values(df, "description")

    total = len(df)
    print(f"Total records: {total}")
    print("Starting company classification...\n")

    recruiter_terms = ('recruit', 'talent', 'staffing', 'personnel', 'human resources', 'hr agency', 'employment')

    # One entry per row, so the result columns always exist and line up with
    # the frame regardless of its index.
    all_classifications = []
    extracted_names = []
    company_types = []
    industries = []

    records = zip(company_names, industry_types, sub_classifications, descriptions)
    for position, (company_name, industry_type, sub_classification, description) in enumerate(records, start=1):
        print(f"Company {position}/{total}:")

        # Names may be NaN or non-string (e.g. a purely numeric name).
        name_text = "" if pd.isna(company_name) else str(company_name).strip()

        if not has_name_column:
            print("Company Name column not found in dataset")
        elif name_text:
            print(f"Company Name: {company_name}")
        else:
            print("Company Name: [Not specified]")

        classification = None
        parsed_company_name = None
        company_type = None
        industry = None
        called_model = False

        if name_text:
            print(f"Processing company: {company_name}")

            lowered_name = name_text.lower()
            is_likely_recruiter = any(term in lowered_name for term in recruiter_terms)
            if is_likely_recruiter:
                print(f"  Company name suggests a recruiter: {company_name}")

            called_model = True
            classification = classify_sector_with_ollama(
                company_name, industry_type, sub_classification, description
            ) or None

            if classification:
                print(f"Classification result: {classification}")

                parsed_company_name, company_type, industry = parse_classification_result(classification)

                # Second opinion: the name hints at a recruiter but the model
                # said Direct, so let the description text decide.
                if is_likely_recruiter and company_type == "Direct":
                    if isinstance(description, str) and detect_recruiter_in_text(description):
                        print("  Overriding company type to Recruiter based on validation")
                        company_type = "Recruiter"

                print(f"  Final classification: {company_type} - {industry}")
            else:
                print("No classification extracted")
        else:
            print("Company name is empty for this job")

        all_classifications.append(classification)
        extracted_names.append(parsed_company_name)
        company_types.append(company_type)
        industries.append(industry)

        print("-" * 50)

        # Throttle only when the model was actually queried for this row.
        if called_model:
            time.sleep(delay_seconds)

    df['extracted_company_name'] = extracted_names
    df['company_type'] = company_types
    df['industry'] = industries
    df['company_classification'] = all_classifications

    df.to_csv(output_path, index=False)
    print(f"\nResults saved to {output_path}")

    _print_summary(df, company_names)
301
302
# Run the classification pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()