"""
✅ Compared to the previous lesson (no. 26), this script improves soft skill extraction by verifying the results, saving them to seek_jobs_with_skills.csv, and handling AI/API failures gracefully.
✅ Extracts soft skills from job descriptions using Ollama's Gemma model, with strict filtering and synonym matching.
✅ Implements retry logic, timeout handling, and a fallback to keyword-based extraction if the AI fails.
✅ Checks for the presence of extracted soft skills (or their variations) in the original job text to ensure accuracy.
✅ Uses checkpointing to resume progress and prevent data loss during large batch processing.
✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed, so that intermediate results are not lost.
"""
import json
import os
import random
import re
import time
import traceback
from contextlib import redirect_stdout

import pandas as pd
import requests
from requests.exceptions import Timeout, ConnectionError
# Terms the model sometimes returns that are technical skills or credentials
# rather than soft skills. Compared against the lower-cased extracted item.
_NON_SOFT_SKILLS = frozenset({
    "python", "java", "excel", "sql", "aws", "azure", "tableau",
    "powerbi", "data", "analytics", "database", "programming",
    "degree", "bachelor", "master", "certification", "license",
})

# Canonical soft skill -> variations accepted as evidence of that skill.
_SKILL_SYNONYMS = {
    "communication": ["communicate", "verbal", "written communication", "articulate", "presenting"],
    "leadership": ["lead", "leading", "leader", "motivate", "influence"],
    "teamwork": ["team player", "collaborate", "collaboration", "team-oriented", "team work"],
    "adaptability": ["adapt", "flexible", "flexibility", "versatile", "versatility"],
    "problem-solving": ["solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"],
    "time management": ["prioritize", "prioritization", "deadlines", "punctual", "time-conscious"],
    "creativity": ["creative", "innovative", "innovation", "creative thinking", "think outside the box"],
    "critical thinking": ["analytical", "analyze", "critical", "logical", "reasoning"],
}


def _parse_skill_array(result):
    """Return the JSON array embedded in the model reply, or None if no brackets are found.

    Raises json.JSONDecodeError when the bracketed span is not valid JSON.
    """
    start_idx = result.find('[')
    end_idx = result.rfind(']') + 1
    if start_idx >= 0 and end_idx > start_idx:
        return json.loads(result[start_idx:end_idx])
    return None


def _verify_soft_skills(candidates, job_description_lower):
    """Filter model output down to skills that are evidenced in the job text.

    Non-string items, single characters and deny-listed technical terms are
    dropped. A skill is kept as-is when it appears verbatim in the text;
    otherwise it is mapped to its canonical (title-cased) name when the
    canonical name or any of its synonyms appears. Duplicates are removed
    case-insensitively, preserving first-seen order.
    """
    verified = []
    seen = set()

    def keep(skill):
        key = skill.lower()
        if key not in seen:
            seen.add(key)
            verified.append(skill)

    for skill in candidates:
        # The model occasionally emits numbers or nested objects; calling
        # .lower() on those would raise and discard the whole extraction.
        if not isinstance(skill, str):
            continue

        skill_lower = skill.lower()
        if len(skill) <= 1 or skill_lower in _NON_SOFT_SKILLS:
            continue

        # Direct mention in the job description.
        if skill_lower in job_description_lower:
            keep(skill)
            continue

        # Otherwise accept it only through a known synonym group with evidence.
        for main_skill, synonyms in _SKILL_SYNONYMS.items():
            if skill_lower == main_skill or skill_lower in synonyms:
                if main_skill in job_description_lower or any(
                    syn in job_description_lower for syn in synonyms
                ):
                    keep(main_skill.title())  # Use standardized version
                    break

    return verified


def _sleep_before_retry(attempt):
    """Sleep with exponential backoff plus jitter before the next attempt."""
    backoff_time = (2 ** attempt) + random.uniform(0, 1)
    print(f"Retrying in {backoff_time:.2f} seconds...")
    time.sleep(backoff_time)


def extract_soft_skills_with_ollama(job_description, max_retries=3, timeout=60):
    """
    Extract soft skills from a job description using Ollama with the Gemma model.

    The model reply is parsed for a JSON array, filtered against a deny-list
    of technical terms, and every remaining skill is verified against the
    job text (directly or through known synonyms).

    Args:
        job_description: Text of the job posting. Missing values (None/NaN)
            and empty or whitespace-only text yield an empty result without
            contacting the API.
        max_retries: Maximum number of API attempts. Timeouts, connection
            errors and HTTP 5xx responses are retried with backoff.
        timeout: Per-request timeout in seconds passed to requests.post.

    Returns:
        A list of verified soft skill names; empty when nothing was found or
        when extraction failed (failures are printed, never raised).
    """
    if pd.isna(job_description) or not str(job_description).strip():
        return []

    job_description = str(job_description)
    job_description_lower = job_description.lower()

    prompt = f"""
    You are a specialized soft skills extractor for job descriptions. Your task is to identify ONLY soft skills, interpersonal abilities, and personal attributes mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY soft skills and interpersonal abilities
    2. Include ONLY: communication skills, leadership qualities, interpersonal traits, personal attributes
    3. Do NOT include: technical skills, tools, programming languages, or job requirements
    4. Do NOT include job titles, responsibilities, or education requirements
    5. IMPORTANT: ONLY extract soft skills that are EXPLICITLY mentioned in the text
    6. If no soft skills are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: communication, teamwork, leadership, problem-solving, adaptability, time management, creativity, critical thinking
    - DO NOT INCLUDE: Python, project management (as a role), BA degree, Excel, analytics (as a field)

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of soft skills found.
    Example response format: ["Communication", "Teamwork", "Leadership"]
    """

    # Make request to local Ollama API with retry logic
    for attempt in range(max_retries):
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False,
                },
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server errors are worth retrying; client errors will not change.
                if response.status_code >= 500 and attempt < max_retries - 1:
                    _sleep_before_retry(attempt)
                    continue
                return []

            result = response.json()["response"]

            try:
                soft_skills = _parse_skill_array(result)
            except json.JSONDecodeError:
                print(f"Failed to parse JSON from: {result}")
                return []

            if soft_skills is None:
                print(f"Couldn't find JSON array in: {result}")
                return []

            # CRITICAL: keep only skills that actually appear in the job description
            verified_skills = _verify_soft_skills(soft_skills, job_description_lower)

            print(f"Initial extraction: {soft_skills}")
            print(f"After verification: {verified_skills}")
            return verified_skills

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if attempt < max_retries - 1:
                _sleep_before_retry(attempt)
            else:
                print("Max retries exceeded. Moving on without skills extraction.")
                return []
        except Exception as e:
            # Deliberate best-effort boundary: one bad reply must not abort
            # a long batch run, so report it and return no skills.
            print(f"Unexpected exception: {str(e)}")
            return []

    return []
# Canonical soft skill -> keywords whose presence signals that skill.
_COMMON_SOFT_SKILLS = {
    "Communication": ["communication", "communicate", "verbal", "written", "articulate", "presenting", "presentation"],
    "Teamwork": ["teamwork", "team player", "collaborate", "collaboration", "team-oriented", "team work"],
    "Leadership": ["leadership", "lead", "leading", "leader", "motivate", "influence"],
    "Problem-solving": ["problem-solving", "solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"],
    "Adaptability": ["adaptability", "adapt", "flexible", "flexibility", "versatile", "versatility"],
    "Time Management": ["time management", "prioritize", "prioritization", "deadlines", "punctual", "time-conscious"],
    "Creativity": ["creativity", "creative", "innovative", "innovation", "creative thinking", "think outside the box"],
    "Critical Thinking": ["critical thinking", "analytical", "analyze", "critical", "logical", "reasoning"],
    "Attention to Detail": ["attention to detail", "detail-oriented", "meticulous", "thorough", "precise", "accuracy"],
    "Interpersonal Skills": ["interpersonal", "people skills", "rapport", "relationship building"],
    "Organizational Skills": ["organizational", "organized", "organization", "structure", "systematic"],
    "Self-motivation": ["self-motivated", "self-starter", "initiative", "proactive", "driven", "ambitious"],
    "Conflict Resolution": ["conflict resolution", "mediation", "dispute", "negotiation", "diplomacy"],
    "Decision Making": ["decision making", "decisive", "judgment", "discernment"],
    "Emotional Intelligence": ["emotional intelligence", "empathy", "self-awareness", "social awareness"],
    "Resilience": ["resilience", "resilient", "perseverance", "tenacity", "determination"],
    "Customer Service": ["customer service", "client-focused", "customer-oriented", "service-minded"],
}

# One compiled alternation per skill. The negative lookbehind requires a
# keyword to start at a word boundary, so "structure" no longer matches
# inside "infrastructure" while inflected forms such as "leads" still match.
_SOFT_SKILL_PATTERNS = {
    skill: re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(keyword.lower()) for keyword in keywords) + r")"
    )
    for skill, keywords in _COMMON_SOFT_SKILLS.items()
}


def fallback_extract_soft_skills(job_description):
    """
    Simple keyword-based soft skills extraction as fallback when Ollama fails.

    Args:
        job_description: Text of the job posting; None/NaN or empty text
            yields an empty list. Non-string values are converted with str().

    Returns:
        Canonical skill names, each at most once, in the order they are
        declared in the keyword table, for every skill that has at least one
        keyword starting at a word boundary in the lower-cased text.
    """
    if pd.isna(job_description):
        return []

    job_description_lower = str(job_description).lower()
    if not job_description_lower.strip():
        return []

    return [
        skill
        for skill, pattern in _SOFT_SKILL_PATTERNS.items()
        if pattern.search(job_description_lower)
    ]
# Checkpoint-only column marking which rows have already been processed.
# Needed because an empty result ('') and an unprocessed row (None) are both
# written as an empty CSV cell and both read back as NaN.
_CHECKPOINT_DONE_COLUMN = "checkpoint_processed"


def _cell_text(row, column):
    """Return the row's value for *column* as text, or '' when absent or NaN."""
    value = row.get(column, '')
    return '' if pd.isna(value) else str(value)


def _is_true(flag):
    """Interpret a checkpoint cell (bool, 'True'/'False' text, or NaN) as a bool."""
    return str(flag).strip().lower() == 'true'


def _load_checkpoint(checkpoint_file, expected_rows):
    """Load resume state as (start_idx, extracted_strings, fallback_flags).

    Returns a fresh state (start at 0) when the checkpoint is missing, lacks
    the results column, or does not have exactly *expected_rows* rows.
    """
    fresh = (0, [None] * expected_rows, [False] * expected_rows)
    if not os.path.exists(checkpoint_file):
        return fresh

    checkpoint_df = pd.read_csv(checkpoint_file)
    if 'extracted_soft_skills_str' not in checkpoint_df.columns:
        return fresh
    if len(checkpoint_df) != expected_rows:
        # A checkpoint from a different dataset cannot be aligned row by row.
        print(f"Ignoring checkpoint {checkpoint_file}: it has {len(checkpoint_df)} rows "
              f"but the dataset has {expected_rows}")
        return fresh

    skills_column = checkpoint_df['extracted_soft_skills_str']
    if _CHECKPOINT_DONE_COLUMN in checkpoint_df.columns:
        done_flags = [_is_true(flag) for flag in checkpoint_df[_CHECKPOINT_DONE_COLUMN]]
        start_idx = next(
            (position for position, done in enumerate(done_flags) if not done),
            expected_rows,
        )
    else:
        # Older checkpoint without the marker column: everything up to the
        # last non-empty result was certainly processed.
        last_valid = skills_column.last_valid_index()
        start_idx = 0 if last_valid is None else int(last_valid) + 1

    # Processed rows with no skills come back as NaN; restore them to ''.
    extracted = [
        ('' if pd.isna(value) else str(value)) if position < start_idx else None
        for position, value in enumerate(skills_column)
    ]
    if 'used_fallback' in checkpoint_df.columns:
        fallback_flags = [_is_true(flag) for flag in checkpoint_df['used_fallback']]
    else:
        fallback_flags = [False] * expected_rows

    print(f"Resuming from checkpoint at index {start_idx}")
    return start_idx, extracted, fallback_flags


def _save_checkpoint(df, checkpoint_file, extracted, fallback_flags, processed_count):
    """Write the dataset plus current results and per-row processed markers."""
    snapshot = df.copy()
    snapshot['extracted_soft_skills_str'] = extracted
    snapshot['used_fallback'] = fallback_flags
    snapshot[_CHECKPOINT_DONE_COLUMN] = [
        position < processed_count for position in range(len(snapshot))
    ]
    snapshot.to_csv(checkpoint_file, index=False)


def main():
    """Extract soft skills for every job in seek_jobs.csv and save the results.

    For each row the job type, title, details and description are combined
    and sent to extract_soft_skills_with_ollama; when that returns nothing,
    fallback_extract_soft_skills is used instead. Progress is checkpointed
    every 50 rows and at the end, per-job output goes to
    skills_extraction_log.txt, and the final dataset is written to
    seek_jobs_with_skills.csv. Any error is reported on the console together
    with a traceback rather than propagated.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_skills.csv"
    log_path = "skills_extraction_log.txt"
    checkpoint_file = "skills_extraction_checkpoint.csv"
    checkpoint_interval = 50

    # Columns to extract soft skills from
    title_column = "job_title"  # Column containing job titles
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"
    text_columns = (job_type_column, title_column, job_details_column, description_column)

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting soft skills extraction...\n")

        start_idx, extracted_soft_skills_str, used_fallback = _load_checkpoint(
            checkpoint_file, total_jobs
        )

        # Per-job output goes to the log file; the context managers restore
        # stdout and close the file on any exit path, including Ctrl-C.
        with open(log_path, 'w', encoding='utf-8') as log_file, redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                # Combine relevant columns for more context, skipping
                # missing cells so NaN does not become the word "nan".
                parts = (_cell_text(row, column) for column in text_columns)
                combined_text = ' '.join(part for part in parts if part).strip()

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    soft_skills = extract_soft_skills_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    fell_back = not soft_skills
                    if fell_back:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        soft_skills = fallback_extract_soft_skills(combined_text)

                    used_fallback[idx] = fell_back
                    extracted_soft_skills_str[idx] = ', '.join(soft_skills) if soft_skills else ''
                else:
                    print("Combined text is empty for this job")
                    used_fallback[idx] = False
                    extracted_soft_skills_str[idx] = ''

                print("-" * 50)

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    _save_checkpoint(
                        df, checkpoint_file, extracted_soft_skills_str, used_fallback, idx + 1
                    )

                # Avoid overwhelming the local Ollama service
                time.sleep(random.uniform(0.5, 1.5))

        df['extracted_soft_skills_str'] = extracted_soft_skills_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [
            column
            for column in (title_column, 'extracted_soft_skills_str', 'used_fallback')
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_skills = sum(1 for skills in extracted_soft_skills_str if skills)
        jobs_using_fallback = sum(used_fallback)

        def percent(count):
            # An empty dataset must not turn the summary into a ZeroDivisionError.
            return count / total_jobs * 100 if total_jobs else 0.0

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted soft skills: {jobs_with_skills} ({percent(jobs_with_skills):.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({percent(jobs_using_fallback):.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary of the script: report and show the traceback.
        print(f"Script error: {str(e)}")
        traceback.print_exc()
# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()