AusBiz Consulting

industry_classify.py (Python)

1import pandas as pd
2import time
3import re
4import requests
5import json
6
def classify_sector_with_ollama(
    company_name,
    industry_type,
    sub_classification,
    description,
    *,
    model="gemma3:1b",
    api_url="http://localhost:11434/api/generate",
    max_retries=3,
    timeout_seconds=45,
):
    """Ask an Ollama-hosted model to classify a job posting's company.

    The four job fields are embedded in a fixed prompt that asks the model
    for the company type (Recruiter / Direct) and an inferred industry.

    Args:
        company_name: Company name from the dataset (may be NaN/None).
        industry_type: Industry type field (may be NaN/None).
        sub_classification: Sub-classification field (may be NaN/None).
        description: Job description text (may be NaN/None).
        model: Name of the Ollama model to query.
        api_url: URL of the Ollama ``generate`` endpoint.
        max_retries: Maximum number of POST attempts.
        timeout_seconds: Per-request timeout passed to ``requests.post``.

    Returns:
        The model's response text with surrounding whitespace removed, or
        ``None`` when every field is empty/missing or all attempts fail.
    """
    raw_fields = (company_name, industry_type, sub_classification, description)
    field_texts = ["" if pd.isna(value) else str(value) for value in raw_fields]

    # The formatted block below always contains the field labels, so the
    # emptiness check has to look at the field values themselves.
    if not any(text.strip() for text in field_texts):
        return None

    name_text, industry_text, sub_text, description_text = field_texts
    combined_text = f"""
    Company Name: {name_text}
    Industry Type: {industry_text}
    Sub-classification: {sub_text}
    Description: {description_text}
    """

    prompt = f"""
    You are an AI model that classifies companies into Recruiter or Direct Employer, and identifies their industry.

    FIRST TASK - COMPANY TYPE CLASSIFICATION:
    Determine if the company is a Recruiter or Direct Employer. This is extremely important.

    RECRUITER INDICATORS (if ANY of these are present, classify as RECRUITER):
    - Phrases like "our client", "on behalf of", "recruiting for"
    - Mentions of "a client in the ___ industry"
    - Company name contains words like "Recruitment", "Talent", "Personnel", "Staffing", "People"
    - Description mentions placing candidates with other companies
    - Job posting focuses on what "the client" needs rather than what "we" need
    - Description mentions multiple different positions across different companies

    DIRECT EMPLOYER INDICATORS:
    - Job description talks about "our company", "our team", "join us"
    - No mention of clients or recruiting on behalf of others
    - Company described as the actual place where work will be performed

    SECOND TASK - INDUSTRY CLASSIFICATION:

    If company is a RECRUITER:
    - Do NOT use the recruiter's company name for industry classification
    - Look for client industry mentions in the description
    - If multiple clients mentioned, choose the most prominent industry

    If company is a DIRECT EMPLOYER:
    - Use both company name and job description
    - Check if industry_type field contains specific industry information
    - If company name contains "Consulting" and is not a recruiter, classify as "Consulting"

    FORMAT your response EXACTLY as follows:
    Company Name: [final cleaned company name]
    Company Type: [Recruiter or Direct]
    Inferred Industry: [single-word or hyphenated industry name]

    Job Information:
    {combined_text}
    """

    for attempt in range(1, max_retries + 1):
        retry_delay = 2
        try:
            print(f"  Attempt {attempt}/{max_retries} for {company_name}...")

            response = requests.post(
                api_url,
                json={
                    "model": model,
                    "prompt": prompt,
                    "stream": False,
                },
                timeout=timeout_seconds,
            )

            if response.status_code == 200:
                result = response.json()["response"]
                print(f"[✔] {company_name} → Successfully classified")
                return result.strip()

            print(f"Error from Ollama API: {response.status_code} - {response.text}")
        except requests.exceptions.Timeout:
            print(f"Timeout error on attempt {attempt}. Waiting before retry...")
            retry_delay = 3
        except Exception as e:
            # Deliberate best-effort boundary: connection errors, malformed
            # JSON or a missing "response" key all just trigger a retry.
            print(f"Exception when calling Ollama API: {str(e)}")

        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(retry_delay)

    print(f"All {max_retries} attempts failed for {company_name}")
    return None
95
96
def parse_classification_result(result_text):
    """Extract company name, company type and industry from model output.

    Looks for lines labelled ``Company Name:``, ``Company Type:`` and
    ``Inferred Industry:`` (case-insensitive). Leading list/markdown
    decoration on the label (``-``, ``*``, ``#``, ``>``) and markdown
    emphasis or a single pair of enclosing square brackets around the value
    are tolerated. When a label occurs more than once the last one wins.

    Args:
        result_text: Raw text returned by the model (may be None/NaN).

    Returns:
        A ``(company_name, company_type, industry)`` tuple. Each element is
        ``None`` when the label is absent or its value is empty.
        ``company_type`` is normalised to ``"Recruiter"`` when the value
        contains "recruit" or "agency", otherwise to ``"Direct"``.
    """
    # Anything that is not a non-blank string (None, NaN, numbers) has
    # nothing to parse.
    if not isinstance(result_text, str) or not result_text.strip():
        return None, None, None

    label_to_field = {
        "company name": "company_name",
        "company type": "company_type",
        "inferred industry": "industry",
    }
    parsed = dict.fromkeys(label_to_field.values())

    for raw_line in result_text.splitlines():
        label, separator, value = raw_line.partition(":")
        if not separator:
            continue

        # Drop bullet / heading / emphasis characters around the label.
        field = label_to_field.get(label.strip(" \t*#->_`").lower())
        if field is None:
            continue

        value = value.strip(" \t*`")
        # The prompt shows placeholders in [brackets]; models sometimes
        # echo the brackets around the real answer.
        if len(value) >= 2 and value.startswith("[") and value.endswith("]"):
            value = value[1:-1].strip()

        # Store empty values as None so callers can treat them as missing.
        parsed[field] = value or None

    company_type = parsed["company_type"]
    if company_type is not None:
        lowered = company_type.lower()
        if "recruit" in lowered or "agency" in lowered:
            company_type = "Recruiter"
        else:
            company_type = "Direct"

    return parsed["company_name"], company_type, parsed["industry"]
129
130
def detect_recruiter_in_text(text):
    """Heuristically decide whether free text reads like a recruiter's posting.

    Serves as a fallback check next to the model's own classification.
    Returns True when any strong recruitment phrase occurs in the text, or
    when at least two distinct recruitment keywords occur; otherwise False.
    Missing or empty text yields False.
    """
    if pd.isna(text) or not text:
        return False

    lowered = text.lower()

    # A single occurrence of any of these phrases is decisive.
    strong_phrases = (
        "our client",
        "on behalf of",
        "recruiting for",
        "recruitment agency",
        "talent agency",
        "staffing agency",
        "recruitment firm",
        "for our client",
        "for my client",
        "my client is seeking",
        "our client is looking",
    )
    if any(phrase in lowered for phrase in strong_phrases):
        return True

    # Weaker signals: two or more different keywords must be present.
    weak_keywords = (
        "recruitment",
        "recruiter",
        "talent acquisition",
        "staffing",
        "personnel agency",
        "employment agency",
        "job agency",
    )
    matched = [keyword for keyword in weak_keywords if keyword in lowered]
    return len(matched) >= 2
177
178
def main(file_path="seek_jobs.csv", output_file="seek_jobs_classified.csv"):
    """Classify every job row in a CSV and write an enriched copy.

    For each row with a company name, the company is classified through
    ``classify_sector_with_ollama``, the reply is parsed with
    ``parse_classification_result``, and a "Direct" verdict is overridden to
    "Recruiter" when both the company name and the description look like a
    recruiter's. The columns ``extracted_company_name``, ``company_type``,
    ``industry`` and ``company_classification`` (raw model text) are added,
    the frame is saved, and a short summary is printed.

    Args:
        file_path: Path of the input CSV.
        output_file: Path the classified CSV is written to.
    """
    print("✅ Script started")

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)
    total_count = len(df)

    print(f"📄 Data loaded: {total_count} rows")

    # Columns to use for classification
    company_name_column = "company_name"
    industry_type_column = "industry_type"
    sub_classification_column = "sub_classification"
    description_column = "description"

    # Create the result columns up front so the summary below still works
    # when no row is classified (e.g. the Ollama server is not running).
    for column in ("extracted_company_name", "company_type", "industry"):
        if column not in df.columns:
            df[column] = None

    has_company_name = company_name_column in df.columns
    if not has_company_name:
        print("Company Name column not found in dataset")

    def cell(row_label, column):
        """Return the row's value for ``column``, or "" if the column is absent."""
        return df.at[row_label, column] if column in df.columns else ""

    recruiter_terms = ['recruit', 'talent', 'staffing', 'personnel', 'human resources', 'hr agency', 'employment']

    print(f"Total records: {total_count}")
    print("Starting company classification...\n")

    all_classifications = []

    for position, idx in enumerate(df.index, start=1):
        print(f"Company {position}/{total_count}:")

        company_name = cell(idx, company_name_column)
        industry_type = cell(idx, industry_type_column)
        sub_classification = cell(idx, sub_classification_column)
        description = cell(idx, description_column)

        raw_classification = None

        if pd.isna(company_name) or str(company_name).strip() == "":
            print("Company Name: [Not specified]")
            print("Company name is empty for this job")
        else:
            print(f"Company Name: {company_name}")
            print(f"Processing company: {company_name}")

            # Heuristic on the company name; str() guards against names
            # that pandas parsed as numbers.
            is_likely_recruiter = any(
                term in str(company_name).lower() for term in recruiter_terms
            )
            if is_likely_recruiter:
                print(f"  Company name suggests a recruiter: {company_name}")

            classification = classify_sector_with_ollama(
                company_name, industry_type, sub_classification, description
            )

            if classification:
                print(f"Classification result: {classification}")
                raw_classification = classification
                parsed_company_name, company_type, industry = parse_classification_result(classification)

                # If the name suggests a recruiter but the model says Direct,
                # let the description text decide.
                if (
                    is_likely_recruiter
                    and company_type == "Direct"
                    and isinstance(description, str)
                    and detect_recruiter_in_text(description)
                ):
                    print("  Overriding company type to Recruiter based on validation")
                    company_type = "Recruiter"

                df.at[idx, 'extracted_company_name'] = parsed_company_name
                df.at[idx, 'company_type'] = company_type
                df.at[idx, 'industry'] = industry

                print(f"  Final classification: {company_type} - {industry}")
            else:
                print("No classification extracted")

            # Throttle only after a row that actually called the Ollama API.
            time.sleep(1.5)

        all_classifications.append(raw_classification)
        print("-" * 50)

    # Add raw classifications to dataframe
    df['company_classification'] = all_classifications

    # Save the results to CSV
    df.to_csv(output_file, index=False)
    print(f"\nResults saved to {output_file}")

    # Display results
    print("\n--- Sample Results ---")
    for _, row in df.head(10).iterrows():
        label = row[company_name_column] if has_company_name else "[Not specified]"
        print(f"\n{label}:")
        print(f"  Type: {row['company_type'] if not pd.isna(row['company_type']) else 'Unknown'}")
        print(f"  Industry: {row['industry'] if not pd.isna(row['industry']) else 'Unknown'}")

    # value_counts() already leaves out missing values.
    print("\n--- Company Type Distribution ---")
    for company_type, count in df['company_type'].value_counts().items():
        print(f"{company_type}: {count}")

    print("\n--- Top Industries ---")
    for industry, count in df['industry'].value_counts().head(10).items():
        print(f"{industry}: {count}")

    # Success rate; guard against an empty dataset to avoid dividing by zero.
    classified_count = int(df['industry'].notna().sum())
    if total_count:
        print(f"\nClassification success rate: {classified_count / total_count:.1%} ({classified_count}/{total_count})")
    else:
        print("\nClassification success rate: n/a (0/0)")


if __name__ == "__main__":
    main()

1import pandas as pd 2import time 3import re 4import requests 5import json 6 7def classify_sector_with_ollama(company_name, industry_type, sub_classification, description): 8 """ 9 Classify company sector using Ollama with Gemma model 10 """ 11 12 combined_text = f""" 13 Company Name: {company_name if not pd.isna(company_name) else ''} 14 Industry Type: {industry_type if not pd.isna(industry_type) else ''} 15 Sub-classification: {sub_classification if not pd.isna(sub_classification) else ''} 16 Description: {description if not pd.isna(description) else ''} 17 """ 18 19 if pd.isna(combined_text) or combined_text.strip() == "": 20 return None 21 22 prompt = f""" 23 You are an AI model that classifies companies into Recruiter or Direct Employer, and identifies their industry. 24 25 FIRST TASK - COMPANY TYPE CLASSIFICATION: 26 Determine if the company is a Recruiter or Direct Employer. This is extremely important. 27 28 RECRUITER INDICATORS (if ANY of these are present, classify as RECRUITER): 29 - Phrases like "our client", "on behalf of", "recruiting for" 30 - Mentions of "a client in the ___ industry" 31 - Company name contains words like "Recruitment", "Talent", "Personnel", "Staffing", "People" 32 - Description mentions placing candidates with other companies 33 - Job posting focuses on what "the client" needs rather than what "we" need 34 - Description mentions multiple different positions across different companies 35 36 DIRECT EMPLOYER INDICATORS: 37 - Job description talks about "our company", "our team", "join us" 38 - No mention of clients or recruiting on behalf of others 39 - Company described as the actual place where work will be performed 40 41 SECOND TASK - INDUSTRY CLASSIFICATION: 42 43 If company is a RECRUITER: 44 - Do NOT use the recruiter's company name for industry classification 45 - Look for client industry mentions in the description 46 - If multiple clients mentioned, choose the most prominent industry 47 48 If company is a DIRECT EMPLOYER: 
49 - Use both company name and job description 50 - Check if industry_type field contains specific industry information 51 - If company name contains "Consulting" and is not a recruiter, classify as "Consulting" 52 53 FORMAT your response EXACTLY as follows: 54 Company Name: [final cleaned company name] 55 Company Type: [Recruiter or Direct] 56 Inferred Industry: [single-word or hyphenated industry name] 57 58 Job Information: 59 {combined_text} 60 """ 61 62 max_retries = 3 63 timeout_seconds = 45 64 65 for attempt in range(max_retries): 66 try: 67 print(f" Attempt {attempt+1}/{max_retries} for {company_name}...") 68 69 response = requests.post( 70 "http://localhost:11434/api/generate", 71 json={ 72 "model": "gemma3:1b", 73 "prompt": prompt, 74 "stream": False 75 }, 76 timeout=timeout_seconds 77 ) 78 79 if response.status_code == 200: 80 result = response.json()["response"] 81 print(f"[✔] {company_name} → Successfully classified") 82 return result.strip() 83 else: 84 print(f"Error from Ollama API: {response.status_code} - {response.text}") 85 time.sleep(2) 86 except requests.exceptions.Timeout: 87 print(f"Timeout error on attempt {attempt+1}. 
Waiting before retry...") 88 time.sleep(3) 89 except Exception as e: 90 print(f"Exception when calling Ollama API: {str(e)}") 91 time.sleep(2) 92 93 print(f"All {max_retries} attempts failed for {company_name}") 94 return None 95 96 97def parse_classification_result(result_text): 98 """ 99 Parse the classification result to extract structured information 100 """ 101 if not result_text or pd.isna(result_text): 102 return None, None, None 103 104 company_name = None 105 company_type = None 106 industry = None 107 108 # Try to find structured information 109 lines = result_text.strip().split('\n') 110 for line in lines: 111 line = line.strip() 112 if line.lower().startswith('company name:'): 113 company_name = line.split(':', 1)[1].strip() 114 elif line.lower().startswith('company type:'): 115 company_type = line.split(':', 1)[1].strip() 116 elif line.lower().startswith('inferred industry:'): 117 industry = line.split(':', 1)[1].strip() 118 119 # Perform additional validation on company_type 120 if company_type: 121 # Normalize company_type to either "Recruiter" or "Direct" 122 company_type = company_type.strip() 123 if company_type.lower() == "recruiter" or "recruit" in company_type.lower() or "agency" in company_type.lower(): 124 company_type = "Recruiter" 125 else: 126 company_type = "Direct" 127 128 return company_name, company_type, industry 129 130 131def detect_recruiter_in_text(text): 132 """ 133 Attempt to detect if a company is a recruiter based on text analysis 134 Used as a fallback validation 135 """ 136 if pd.isna(text) or not text: 137 return False 138 139 text = text.lower() 140 141 # Strong recruitment indicators 142 recruiter_phrases = [ 143 "our client", 144 "on behalf of", 145 "recruiting for", 146 "recruitment agency", 147 "talent agency", 148 "staffing agency", 149 "recruitment firm", 150 "for our client", 151 "for my client", 152 "my client is seeking", 153 "our client is looking" 154 ] 155 156 recruiter_keywords = [ 157 "recruitment", 158 
"recruiter", 159 "talent acquisition", 160 "staffing", 161 "personnel agency", 162 "employment agency", 163 "job agency" 164 ] 165 166 # Check for recruitment phrases 167 for phrase in recruiter_phrases: 168 if phrase in text: 169 return True 170 171 # Count recruitment keywords 172 keyword_count = sum(1 for keyword in recruiter_keywords if keyword in text) 173 if keyword_count >= 2: # If multiple keywords are found, likely a recruiter 174 return True 175 176 return False 177 178 179def main(): 180 print("✅ Script started") 181 182 # Fix Windows file path if needed 183 file_path = "seek_jobs.csv" 184 185 print(f"Reading dataset from {file_path}...") 186 df = pd.read_csv(file_path) 187 188 print(f"📄 Data loaded: {len(df)} rows") 189 190 # Columns to use for classification 191 company_name_column = "company_name" 192 industry_type_column = "industry_type" 193 sub_classification_column = "sub_classification" 194 description_column = "description" 195 196 print(f"Total records: {len(df)}") 197 print("Starting company classification...\n") 198 199 all_classifications = [] 200 201 for idx in range(len(df)): 202 print(f"Company {idx+1}/{len(df)}:") 203 204 # Print company name first 205 if company_name_column in df.columns: 206 company_name = df.loc[idx, company_name_column] 207 if not pd.isna(company_name) and company_name != "": 208 print(f"Company Name: {company_name}") 209 else: 210 print("Company Name: [Not specified]") 211 else: 212 print("Company Name column not found in dataset") 213 214 # Get information for this company/job 215 company_name = df.loc[idx, company_name_column] if company_name_column in df.columns else "" 216 industry_type = df.loc[idx, industry_type_column] if industry_type_column in df.columns else "" 217 sub_classification = df.loc[idx, sub_classification_column] if sub_classification_column in df.columns else "" 218 description = df.loc[idx, description_column] if description_column in df.columns else "" 219 220 if not pd.isna(company_name) and 
company_name != "": 221 print(f"Processing company: {company_name}") 222 223 # First, try a heuristic approach to detect recruiters in company name 224 is_likely_recruiter = False 225 recruiter_terms = ['recruit', 'talent', 'staffing', 'personnel', 'human resources', 'hr agency', 'employment'] 226 if any(term in company_name.lower() for term in recruiter_terms): 227 is_likely_recruiter = True 228 print(f" Company name suggests a recruiter: {company_name}") 229 230 # Get classification from Ollama 231 classification = classify_sector_with_ollama(company_name, industry_type, sub_classification, description) 232 233 if classification: 234 print(f"Classification result: {classification}") 235 # Parse the classification results 236 parsed_company_name, company_type, industry = parse_classification_result(classification) 237 all_classifications.append(classification) 238 239 # Additional validation for company type 240 if is_likely_recruiter and company_type == "Direct": 241 # If company name suggests recruiter but model says Direct, do an additional check 242 if description and detect_recruiter_in_text(description): 243 print(f" Overriding company type to Recruiter based on validation") 244 company_type = "Recruiter" 245 246 # Store the parsed results in the dataframe 247 df.loc[idx, 'extracted_company_name'] = parsed_company_name 248 df.loc[idx, 'company_type'] = company_type 249 df.loc[idx, 'industry'] = industry 250 251 # Print the final decision 252 print(f" Final classification: {company_type} - {industry}") 253 else: 254 print("No classification extracted") 255 all_classifications.append(None) 256 else: 257 print("Company name is empty for this job") 258 all_classifications.append(None) 259 260 print("-" * 50) 261 262 # Add a small delay between companies to avoid overwhelming the Ollama API 263 time.sleep(1.5) 264 265 # Add raw classifications to dataframe 266 df['company_classification'] = all_classifications 267 268 # Save the results to CSV 269 output_file = 
"seek_jobs_classified.csv" 270 df.to_csv(output_file, index=False) 271 print(f"\nResults saved to {output_file}") 272 273 # Display results 274 print("\n--- Sample Results ---") 275 sample_results = df[[company_name_column, 'company_type', 'industry']].head(10) 276 277 # Format nicely for display 278 for idx, row in sample_results.iterrows(): 279 print(f"\n{row[company_name_column]}:") 280 print(f" Type: {row['company_type'] if not pd.isna(row['company_type']) else 'Unknown'}") 281 print(f" Industry: {row['industry'] if not pd.isna(row['industry']) else 'Unknown'}") 282 283 # Count company types 284 company_type_counts = df['company_type'].value_counts() 285 print("\n--- Company Type Distribution ---") 286 for company_type, count in company_type_counts.items(): 287 if pd.notna(company_type): 288 print(f"{company_type}: {count}") 289 290 # Count industries 291 industry_counts = df['industry'].value_counts().head(10) 292 print("\n--- Top Industries ---") 293 for industry, count in industry_counts.items(): 294 if pd.notna(industry): 295 print(f"{industry}: {count}") 296 297 # Check for missing classifications 298 missing_count = df['industry'].isna().sum() 299 total_count = len(df) 300 print(f"\nClassification success rate: {(total_count - missing_count) / total_count:.1%} ({total_count - missing_count}/{total_count})") 301 302 303if __name__ == "__main__": 304 main()

Industry Classification from dataset (Source code)

Next Up

Lesson 30: Tool extraction and saving data into CSV file (Source code)

Next Up

Lesson 30: Tool extraction and saving data into CSV file (Source code)