✅ Compared to the previous lesson (no. 27), this script improves on soft skill extraction by verifying the results, saving them to seek_jobs_with_methods_frameworks.csv, and handling AI/API failures gracefully.
✅ Extracts project management methods and frameworks using Ollama's Gemma model, with strict rules and synonym-based validation.
✅ Includes robust retry logic, timeout handling, and a fallback keyword-based extractor to handle LLM/API failures.
✅ Verifies extracted items by checking for their exact or synonymous presence in the job description text.
✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed to ensure intermediate results are not lost.
import contextlib
import json
import os
import random
import re
import time
import traceback

import pandas as pd
import requests
from requests.exceptions import ConnectionError, Timeout
# Items the model sometimes returns that are not methods or frameworks.
_OLLAMA_NON_METHODS = frozenset({
    "python", "java", "excel", "sql", "leadership", "communication",
    "teamwork", "degree", "bachelor", "master", "certification",
})

# Canonical display name -> lower-case variations and synonyms.
# The lower-cased key is itself treated as a match term.
_OLLAMA_METHOD_SYNONYMS = {
    "Agile": ["agile methodology", "agile environment", "agile principles", "agile practices"],
    "Scrum": ["scrum master", "scrum team", "scrum methodology", "scrum framework"],
    "Kanban": ["kanban board", "kanban method", "kanban methodology"],
    "Waterfall": ["waterfall model", "waterfall methodology", "waterfall approach"],
    "Lean": ["lean methodology", "lean principles", "lean thinking", "lean management"],
    "Six Sigma": ["6 sigma", "six-sigma", "6-sigma", "lean six sigma"],
    "DevOps": ["dev ops", "dev-ops", "devops practices", "devops culture"],
    "SAFe": ["scaled agile framework", "safe framework", "safe methodology"],
    "PRINCE2": ["prince 2", "prince-2", "projects in controlled environments"],
    "ITIL": ["information technology infrastructure library", "itil framework", "itil processes"],
    "Extreme Programming": ["xp", "xp methodology", "extreme programming practices"],
    "Design Thinking": ["design-thinking", "design thinking methodology"],
    "OKRs": ["objectives and key results", "okr framework", "okr methodology"],
}


def _ollama_term_in_text(term, text_lower):
    """Return True if *term* occurs in *text_lower* as a whole word or phrase."""
    # A plain substring test would accept "xp" inside "experience" or
    # "lean" inside "clean"; require non-word characters on both sides.
    pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
    return re.search(pattern, text_lower) is not None


def _ollama_backoff(attempt, max_retries):
    """Sleep with exponential backoff and jitter; return False if no retry is left."""
    if attempt >= max_retries - 1:
        return False
    backoff_time = (2 ** attempt) + random.uniform(0, 1)
    print(f"Retrying in {backoff_time:.2f} seconds...")
    time.sleep(backoff_time)
    return True


def _ollama_parse_json_array(result):
    """Return the JSON array embedded in the model output, or None if absent or invalid."""
    start_idx = result.find('[')
    end_idx = result.rfind(']') + 1
    if start_idx < 0 or end_idx <= start_idx:
        print(f"Couldn't find JSON array in: {result}")
        return None
    try:
        return json.loads(result[start_idx:end_idx])
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {result}")
        return None


def _ollama_verify_methods(candidates, job_description_lower):
    """Keep only the candidates that are actually present in the job description."""
    verified = []
    seen = set()

    def record(name):
        key = name.lower()
        if key not in seen:
            seen.add(key)
            verified.append(name)

    for candidate in candidates:
        # The model may emit numbers, nulls or nested objects; skip them
        # instead of letting one bad item discard the whole response.
        if not isinstance(candidate, str):
            continue
        method = candidate.strip()
        method_lower = method.lower()
        if len(method) <= 1 or method_lower in _OLLAMA_NON_METHODS:
            continue

        # The method itself appears in the text.
        if _ollama_term_in_text(method_lower, job_description_lower):
            record(method)
            continue

        # Otherwise accept it if a known variation of the same method appears.
        for canonical, synonyms in _OLLAMA_METHOD_SYNONYMS.items():
            main_method = canonical.lower()
            if method_lower != main_method and method_lower not in synonyms:
                continue
            terms = (main_method, *synonyms)
            if any(_ollama_term_in_text(term, job_description_lower) for term in terms):
                record(canonical)  # Use the standardized spelling
            break

    return verified


def extract_methods_frameworks_with_ollama(job_description, max_retries=3, timeout=60):
    """
    Extract methods and frameworks from a job description using Ollama's Gemma model.

    The model's answer is parsed as a JSON array, filtered against a list of
    known non-methods, and every remaining item is verified to occur in the
    description (directly or through a known synonym) as a whole word.

    Args:
        job_description: Job description text. None, NaN and blank input
            yield an empty list; other non-string values are converted
            with str().
        max_retries: Maximum number of API attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        A de-duplicated list of verified method/framework names, in the order
        the model returned them. Returns [] when nothing is found or when the
        API call fails; this function does not raise on API errors.
    """
    # Handle missing or NaN values
    if not isinstance(job_description, str):
        if pd.api.types.is_scalar(job_description) and pd.isna(job_description):
            return []
        job_description = str(job_description)
    if not job_description.strip():
        return []

    job_description_lower = job_description.lower()

    prompt = f"""
    You are a specialized methods and frameworks extractor for job descriptions. Your task is to identify ONLY project management methodologies, development frameworks, and working approaches mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY methods, frameworks, and methodologies
    2. Include ONLY: project methodologies, development frameworks, process frameworks, working approaches
    3. Examples include: Scrum, Agile, Kanban, Waterfall, Lean, Six Sigma, DevOps, SAFe, PRINCE2, ITIL, XP (Extreme Programming)
    4. Do NOT include: programming languages, job titles, soft skills, or technical tools (unless they are actual methodologies)
    5. IMPORTANT: ONLY extract methods/frameworks that are EXPLICITLY mentioned in the text
    6. If no methods or frameworks are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Scrum, Agile, Kanban, Waterfall, Lean, Design Thinking, OKRs, PRINCE2
    - DO NOT INCLUDE: Python, Excel, leadership, communication, teamwork, SQL

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of methods and frameworks found.
    Example response format: ["Agile", "Scrum", "Kanban"]
    """

    # Make request to local Ollama API with retries
    for attempt in range(max_retries):
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False,
                },
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server errors are retried; other statuses give up at once.
                if response.status_code >= 500 and _ollama_backoff(attempt, max_retries):
                    continue
                return []

            result = response.json()["response"]
            methods_frameworks = _ollama_parse_json_array(result)
            if methods_frameworks is None:
                # The model produced no usable array; ask again if attempts remain.
                if _ollama_backoff(attempt, max_retries):
                    continue
                return []

            # CRITICAL: keep only items that actually appear in the job description
            verified_methods = _ollama_verify_methods(methods_frameworks, job_description_lower)
            print(f"Initial extraction: {methods_frameworks}")
            print(f"After verification: {verified_methods}")
            return verified_methods

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if not _ollama_backoff(attempt, max_retries):
                print("Max retries exceeded. Moving on without methods/frameworks extraction.")
                return []
        except Exception as e:
            # Best-effort extraction: any other failure yields an empty result.
            print(f"Unexpected exception: {str(e)}")
            return []

    return []
# Methods and frameworks searched for by the keyword fallback, in output order.
_FALLBACK_COMMON_METHODS = (
    "Agile", "Scrum", "Kanban", "Waterfall", "Lean", "Six Sigma", "DevOps",
    "SAFe", "PRINCE2", "ITIL", "XP", "Extreme Programming", "Design Thinking",
    "OKRs", "Objectives and Key Results", "TDD", "BDD", "Test Driven Development",
    "Behavior Driven Development", "FDD", "Feature Driven Development", "RUP",
    "Rational Unified Process", "Crystal", "DSDM", "Dynamic Systems Development Method",
    "LeSS", "Large-Scale Scrum", "DAD", "Disciplined Agile Delivery", "AUP",
    "Agile Unified Process", "PMBOK", "Project Management Body of Knowledge",
    "RAD", "Rapid Application Development", "MSF", "Microsoft Solutions Framework",
)

# Canonical display name -> lower-case variations and synonyms.
# The lower-cased key is itself treated as a match term.
_FALLBACK_METHOD_SYNONYMS = {
    "Agile": ["agile methodology", "agile environment", "agile principles", "agile practices"],
    "Scrum": ["scrum master", "scrum team", "scrum methodology", "scrum framework"],
    "Kanban": ["kanban board", "kanban method", "kanban methodology"],
    "Waterfall": ["waterfall model", "waterfall methodology", "waterfall approach"],
    "Lean": ["lean methodology", "lean principles", "lean thinking", "lean management"],
    "Six Sigma": ["6 sigma", "six-sigma", "6-sigma", "lean six sigma"],
    "DevOps": ["dev ops", "dev-ops", "devops practices", "devops culture"],
    "SAFe": ["scaled agile framework", "safe framework", "safe methodology"],
    "PRINCE2": ["prince 2", "prince-2", "projects in controlled environments"],
    "ITIL": ["information technology infrastructure library", "itil framework", "itil processes"],
    "Extreme Programming": ["xp", "xp methodology", "extreme programming practices"],
    "Design Thinking": ["design-thinking", "design thinking methodology"],
    "OKRs": ["objectives and key results", "okr framework", "okr methodology"],
}


def _fallback_term_in_text(term, text_lower):
    """Return True if *term* occurs in *text_lower* as a whole word or phrase."""
    # Substring matching would report "XP" for every posting that mentions
    # "experience" and "LeSS" for "unless"; require non-word characters
    # (or the text boundary) on both sides of the term.
    pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
    return re.search(pattern, text_lower) is not None


def fallback_extract_methods_frameworks(job_text):
    """
    Simple keyword-based methods and frameworks extraction, used when Ollama fails.

    Matching is case-insensitive and whole-word. Acronyms that are also
    ordinary English words (e.g. "LeSS" vs. "less", "SAFe" vs. "safe") can
    therefore still match when that word stands alone in the text.

    Args:
        job_text: Job description text. None, NaN and blank input yield an
            empty list; other non-string values are converted with str().

    Returns:
        A list of method/framework names without duplicates, in the fixed
        order of the keyword list, so repeated runs give identical output.
    """
    if not isinstance(job_text, str):
        if pd.api.types.is_scalar(job_text) and pd.isna(job_text):
            return []
        job_text = str(job_text)
    if not job_text.strip():
        return []

    job_text_lower = job_text.lower()
    found_methods = []
    seen = set()

    def record(name):
        key = name.lower()
        if key not in seen:
            seen.add(key)
            found_methods.append(name)

    for method in _FALLBACK_COMMON_METHODS:
        method_lower = method.lower()

        # Direct match
        if _fallback_term_in_text(method_lower, job_text_lower):
            record(method)
            continue

        # Check synonyms: report the canonical name when a variation is present
        for canonical, synonyms in _FALLBACK_METHOD_SYNONYMS.items():
            main_method = canonical.lower()
            if method_lower != main_method and method_lower not in synonyms:
                continue
            terms = (main_method, *synonyms)
            if any(_fallback_term_in_text(term, job_text_lower) for term in terms):
                record(canonical)
            break

    return found_methods
def _main_combine_text(row, columns):
    """Join the non-missing values of *columns* in *row* into one string."""
    parts = []
    for column in columns:
        value = row.get(column, '')
        # str(NaN) would put the literal word "nan" into the prompt.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return ' '.join(parts)


def _main_load_checkpoint(checkpoint_file, row_count):
    """Return (start_idx, extracted, used_fallback) restored from the checkpoint, or a fresh state."""
    fresh_state = (0, [None] * row_count, [False] * row_count)
    if not os.path.exists(checkpoint_file):
        return fresh_state

    checkpoint_df = pd.read_csv(checkpoint_file)
    result_column = 'extracted_methods_frameworks_str'
    if result_column not in checkpoint_df.columns:
        return fresh_state
    if len(checkpoint_df) != row_count:
        # The checkpoint belongs to a different dataset; reusing it would
        # misalign the results with the rows.
        print("Checkpoint does not match the dataset size. Starting from the beginning.")
        return fresh_state

    if 'checkpoint_processed' in checkpoint_df.columns:
        # Rows are processed in order, so the processed rows form a prefix.
        processed = checkpoint_df['checkpoint_processed'].fillna(False).astype(bool)
        start_idx = int(processed.sum())
    else:
        # Checkpoint without the marker column: empty results were written
        # as '' and come back as NaN, so counting non-null cells undercounts.
        # Resume right after the last row that has a stored result.
        last_valid = checkpoint_df[result_column].last_valid_index()
        start_idx = 0 if last_valid is None else int(last_valid) + 1
    print(f"Resuming from checkpoint at index {start_idx}")

    # Copy already processed rows; NaN there means "processed, nothing found".
    stored = checkpoint_df[result_column].tolist()[:start_idx]
    extracted = ['' if pd.isna(value) else str(value) for value in stored]
    extracted += [None] * (row_count - start_idx)

    if 'used_fallback' in checkpoint_df.columns:
        stored_flags = checkpoint_df['used_fallback'].tolist()[:start_idx]
        used_fallback = [False if pd.isna(flag) else bool(flag) for flag in stored_flags]
        used_fallback += [False] * (row_count - start_idx)
    else:
        used_fallback = [False] * row_count

    return start_idx, extracted, used_fallback


def main():
    """
    Extract methods/frameworks for every job in seek_jobs.csv and save the result.

    Reads seek_jobs.csv, runs the Ollama extractor (with the keyword fallback
    when it returns nothing) on the combined text columns of each row, and
    writes seek_jobs_with_methods_frameworks.csv with two added columns:
    'extracted_methods_frameworks_str' and 'used_fallback'. Per-job output is
    written to frameworks_extraction_log.txt. Progress is saved to
    frameworks_extraction_checkpoint.csv every 50 rows and picked up again on
    the next run. Errors are reported on the console and not re-raised.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_methods_frameworks.csv"
    log_path = "frameworks_extraction_log.txt"

    # Checkpoint system to save progress periodically
    checkpoint_interval = 50
    checkpoint_file = "frameworks_extraction_checkpoint.csv"

    title_column = "job_title"
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"
    text_columns = (job_type_column, title_column, job_details_column, description_column)

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting methods and frameworks extraction...\n")

        start_idx, extracted_methods_frameworks_str, used_fallback = _main_load_checkpoint(
            checkpoint_file, total_jobs
        )

        # Per-job output goes to the log file. The context managers restore
        # stdout and close the file on every exit path; UTF-8 keeps non-ASCII
        # job text from failing on platforms with a narrower default encoding.
        with open(log_path, 'w', encoding='utf-8') as log_file, \
                contextlib.redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                # Combine text from multiple columns
                combined_text = _main_combine_text(row, text_columns)

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    methods_frameworks = extract_methods_frameworks_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not methods_frameworks:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        methods_frameworks = fallback_extract_methods_frameworks(combined_text)
                        used_fallback[idx] = True

                    extracted_methods_frameworks_str[idx] = ', '.join(methods_frameworks)
                else:
                    print("Combined text is empty for this job")
                    extracted_methods_frameworks_str[idx] = ''

                print("-" * 50)

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    temp_df = df.copy()
                    temp_df['extracted_methods_frameworks_str'] = extracted_methods_frameworks_str
                    temp_df['used_fallback'] = used_fallback
                    # Explicit marker: an empty result and an unprocessed row
                    # are indistinguishable once written to CSV.
                    temp_df['checkpoint_processed'] = [i <= idx for i in range(total_jobs)]
                    temp_df.to_csv(checkpoint_file, index=False)

                # Avoid overwhelming the local Ollama service
                time.sleep(random.uniform(0.5, 1.5))

        df['extracted_methods_frameworks_str'] = extracted_methods_frameworks_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [
            column
            for column in (title_column, 'extracted_methods_frameworks_str', 'used_fallback')
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_methods = sum(1 for methods in extracted_methods_frameworks_str if methods)
        jobs_using_fallback = sum(1 for flag in used_fallback if flag)

        def percent(count):
            # An empty dataset must not crash the summary.
            return count / total_jobs * 100 if total_jobs else 0.0

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted methods/frameworks: {jobs_with_methods} ({percent(jobs_with_methods):.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({percent(jobs_using_fallback):.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary: report the failure on the console instead of
        # letting the script die with an unhandled exception.
        print(f"Script error: {str(e)}")
        traceback.print_exc()
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()