✅ Compared to the previous lesson (no. 28), this script improves on soft-skill extraction by verifying the results, saving them to seek_jobs_with_languages.csv, and handling AI/API failures gracefully.
✅ Extracts programming, scripting, and markup languages from job descriptions using Ollama’s Gemma model, with strict rule-based prompting.
✅ Implements robust error handling, retry logic, and exponential backoff for reliable API calls to the local Ollama service.
✅ Falls back to a keyword- and synonym-based method when AI extraction fails, ensuring no job is skipped.
✅ Verifies each extracted language against the original job description text to avoid false positives.
✅ Automatically saves progress to a checkpoint file after every 50 jobs to enable resumption and prevent data loss.
import contextlib
import json
import os
import random
import re
import time
import traceback

import pandas as pd
import requests
from requests.exceptions import ConnectionError, Timeout

# Items the model sometimes returns that are frameworks, tools, platforms or
# methodologies rather than languages (compared in lowercase).
_OLLAMA_NON_LANGUAGES = frozenset({
    "react", "angular", "node.js", "django", "flask", "spring", "express",
    "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes",
    "git", "jenkins", "jira", "kanban", "teamwork",
})

# Canonical display name -> lowercase aliases that also count as evidence
# that the language is mentioned in the job description.
_OLLAMA_LANGUAGE_ALIASES = {
    "JavaScript": ("js", "ecmascript"),
    "TypeScript": ("ts",),
    "Python": ("py", "python3", "python 3", "python2", "python 2"),
    "Java": ("jvm",),
    "C#": ("csharp", "c sharp"),
    "C++": ("cpp", "cplusplus", "c plus plus"),
    "Ruby": ("rb", "ruby on rails", "rails"),
    "PHP": (),
    "Swift": (),
    "Kotlin": ("kt",),
    "Go": ("golang", "go language"),
    "SQL": ("tsql", "plsql", "mysql", "postgresql", "oracle sql", "sql server"),
    "HTML": ("html5", "html 5"),
    "CSS": ("css3", "css 3", "scss", "sass"),
    "R": ("r programming", "r language"),
    "Scala": (),
    "Rust": ("rust lang",),
    "MATLAB": (),
    "Perl": (),
    "Bash": ("shell", "shell script", "shell scripting"),
    "PowerShell": ("power shell",),
    "VBA": ("visual basic", "visual basic for applications"),
    "Groovy": (),
    "Fortran": (),
}

# Lowercase name or alias -> canonical display name, used to normalise
# whatever spelling the model returned.
_OLLAMA_NAME_LOOKUP = {
    key: canonical
    for canonical, aliases in _OLLAMA_LANGUAGE_ALIASES.items()
    for key in (canonical.lower(), *aliases)
}

# Names that are also everyday words/letters; their bare form is only
# accepted with this exact capitalisation.
_OLLAMA_CASE_SENSITIVE = frozenset({"Go", "R"})

# "C" alone is too ambiguous, so it is accepted only with explicit context.
_OLLAMA_C_CONTEXT = (
    "c programming", "c language", "ansi c",
    "programming in c", "code in c", "c developer",
)


def _text_mentions(text, term):
    """Return True if *term* occurs in *text* as a standalone token.

    A match may not be glued to a preceding letter, digit, underscore or dot
    (so "java" is not found in "javascript" and "js" is not found in
    "node.js"), nor to a following letter, underscore, "+" or "#" (so "c" is
    not found in "c++").  A trailing digit is allowed so that version
    suffixes such as "html5" still count.  Single-character terms also
    reject a neighbouring "&" so that "R&D" is not read as the R language.
    """
    left = "A-Za-z0-9_."
    right = "A-Za-z_+#"
    if len(term) == 1:
        left += "&"
        right += "&"
    pattern = f"(?<![{left}]){re.escape(term)}(?![{right}])"
    return re.search(pattern, text) is not None


def _language_evidenced(canonical, text, text_lower):
    """Return True if *canonical* or one of its aliases is mentioned in the text."""
    if canonical in _OLLAMA_CASE_SENSITIVE:
        if _text_mentions(text, canonical):
            return True
    elif _text_mentions(text_lower, canonical.lower()):
        return True
    return any(
        _text_mentions(text_lower, alias)
        for alias in _OLLAMA_LANGUAGE_ALIASES[canonical]
    )


def _verify_against_text(candidates, job_description):
    """Keep only the model's candidates that really appear in the description.

    Known languages are returned under their canonical spelling, unknown ones
    under the model's (stripped) spelling; duplicates are dropped while the
    original order is kept.
    """
    text_lower = job_description.lower()
    verified = []
    seen = set()

    for item in candidates:
        if not isinstance(item, str):
            continue  # the model occasionally emits numbers or nested objects
        name = item.strip()
        key = name.lower()
        if not key or key in _OLLAMA_NON_LANGUAGES:
            continue

        accepted = None
        if key == "c":
            if any(_text_mentions(text_lower, ctx) for ctx in _OLLAMA_C_CONTEXT):
                accepted = "C"
        elif key in _OLLAMA_NAME_LOOKUP:
            canonical = _OLLAMA_NAME_LOOKUP[key]
            if _language_evidenced(canonical, job_description, text_lower):
                accepted = canonical
        elif len(key) > 1 and _text_mentions(text_lower, key):
            # Unknown language: accept it only if it is literally in the text.
            accepted = name

        if accepted is not None and accepted.lower() not in seen:
            seen.add(accepted.lower())
            verified.append(accepted)

    return verified


def _first_json_array(text):
    """Return the first JSON array embedded in *text*, or None if there is none.

    Each "[" is tried as the start of a JSON value, so prose or code fences
    around the array, including trailing text that contains brackets, do not
    break parsing.
    """
    decoder = json.JSONDecoder()
    pos = text.find("[")
    while pos != -1:
        try:
            value, _ = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            pos = text.find("[", pos + 1)
            continue
        return value
    return None


def _sleep_with_backoff(attempt):
    """Sleep for 2**attempt seconds plus up to one second of random jitter."""
    backoff_time = (2 ** attempt) + random.uniform(0, 1)
    print(f"Retrying in {backoff_time:.2f} seconds...")
    time.sleep(backoff_time)


def extract_programming_languages_with_ollama(job_description, max_retries=3, timeout=60, *,
                                              model="gemma3:1b",
                                              api_url="http://localhost:11434/api/generate"):
    """
    Extract programming languages from a job description using an Ollama model.

    The model's answer is never trusted as-is: every returned item is checked
    against the job description text, and only languages that are actually
    mentioned are kept.

    Args:
        job_description: Free-text job description. None, NaN, other
            non-string values and blank strings yield an empty result without
            calling the API.
        max_retries: Maximum number of API attempts.
        timeout: Per-request timeout in seconds.
        model: Name of the Ollama model to query.
        api_url: URL of the Ollama "generate" endpoint.

    Returns:
        A list of verified language names (possibly empty). Timeouts,
        connection errors, HTTP errors and unparseable answers are reported
        via print() and result in an empty list rather than an exception.
    """
    if not isinstance(job_description, str) or not job_description.strip():
        return []

    prompt = f"""
    You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY programming, scripting, and markup languages
    2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc.
    3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum)
    4. Do NOT include job titles, responsibilities, or soft skills
    5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text
    6. If no programming languages are mentioned, return an empty array []
    7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript")

    CORRECT EXAMPLES:
    - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS"
    - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile"

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of programming languages found.
    Example response format: ["Python", "JavaScript", "SQL"]
    """

    for attempt in range(max_retries):
        can_retry = attempt < max_retries - 1
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                api_url,
                json={"model": model, "prompt": prompt, "stream": False},
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server-side errors are worth retrying.
                if response.status_code >= 500 and can_retry:
                    _sleep_with_backoff(attempt)
                    continue
                return []

            result = response.json()["response"]
            languages = _first_json_array(result)
            if languages is None:
                print(f"Couldn't find JSON array in: {result}")
                if can_retry:
                    _sleep_with_backoff(attempt)
                    continue
                return []

            verified_languages = _verify_against_text(languages, job_description)
            print(f"Initial extraction: {languages}")
            print(f"After verification: {verified_languages}")
            return verified_languages

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if not can_retry:
                print("Max retries exceeded. Moving on without languages extraction.")
                return []
            _sleep_with_backoff(attempt)
        except Exception as e:
            # Best-effort extraction: never let one bad response stop the run.
            print(f"Unexpected exception: {str(e)}")
            return []

    return []

# Languages the fallback looks for, in the order they are reported.
_FALLBACK_LANGUAGES = (
    "Python", "Java", "JavaScript", "TypeScript", "C#", "C++", "C", "Ruby", "PHP", "Go", "Rust",
    "SQL", "HTML", "CSS", "R", "Scala", "Swift", "Kotlin", "MATLAB", "Perl", "Bash",
    "PowerShell", "VBA", "Groovy", "Fortran", "Lua", "Haskell", "Clojure", "Erlang", "F#",
    "COBOL", "Assembly", "Dart", "Objective-C", "Julia", "Lisp", "Scheme", "Prolog",
)

# Lowercase alternative spellings that also count as a mention.
_FALLBACK_SYNONYMS = {
    "JavaScript": ("js", "ecmascript"),
    "TypeScript": ("ts",),
    "Python": ("py", "python3", "python2"),
    "C#": ("csharp", "c sharp"),
    "C++": ("cpp", "cplusplus", "c plus plus"),
    "Ruby": ("rb",),
    "Go": ("golang",),
    "SQL": ("mysql", "postgresql", "tsql", "plsql", "oracle sql", "sql server"),
    "HTML": ("html5",),
    "CSS": ("css3", "scss", "sass"),
    "R": ("r programming", "r language"),
    "Bash": ("shell", "shell script", "shell scripting"),
    "PowerShell": ("power shell",),
    "VBA": ("visual basic", "visual basic for applications"),
}

# Names that are also everyday words/letters; their bare form is only
# accepted with this exact capitalisation.
_FALLBACK_CASE_SENSITIVE = frozenset({"Go", "R"})

# "C" alone is too ambiguous, so it is reported only with explicit context.
_FALLBACK_C_CONTEXT = (
    "c programming", "c language", "ansi c",
    "programming in c", "code in c", "c developer",
)


def _has_token(text, term):
    """Return True if *term* occurs in *text* as a standalone token.

    A match may not be glued to a preceding letter, digit, underscore or dot
    (so "rust" is not found in "trust" and "js" is not found in "node.js"),
    nor to a following letter, underscore, "+" or "#" (so "scala" is not
    found in "scalable").  A trailing digit is allowed so that version
    suffixes such as "html5" still count.  Single-character terms also
    reject a neighbouring "&" so that "R&D" is not read as the R language.
    """
    left = "A-Za-z0-9_."
    right = "A-Za-z_+#"
    if len(term) == 1:
        left += "&"
        right += "&"
    pattern = f"(?<![{left}]){re.escape(term)}(?![{right}])"
    return re.search(pattern, text) is not None


def fallback_extract_languages(job_text):
    """
    Keyword-based programming language extraction, used when Ollama fails.

    Languages are matched as whole tokens rather than substrings, so ordinary
    words such as "scalable", "properly" or "good" do not produce Scala, Perl
    or Go.

    Args:
        job_text: Free-text job description. None, NaN, other non-string
            values and blank strings yield an empty result.

    Returns:
        The languages found, in the order of the internal language list.
    """
    if not isinstance(job_text, str) or not job_text.strip():
        return []

    job_text_lower = job_text.lower()
    found_languages = []

    for language in _FALLBACK_LANGUAGES:
        if language == "C":
            mentioned = any(_has_token(job_text_lower, ctx) for ctx in _FALLBACK_C_CONTEXT)
        elif language in _FALLBACK_CASE_SENSITIVE:
            mentioned = _has_token(job_text, language)
        else:
            mentioned = _has_token(job_text_lower, language.lower())

        if not mentioned:
            mentioned = any(
                _has_token(job_text_lower, synonym)
                for synonym in _FALLBACK_SYNONYMS.get(language, ())
            )

        if mentioned:
            found_languages.append(language)

    return found_languages

# Column names shared by the checkpoint file and the final output.
_RESULT_COLUMN = "extracted_programming_languages_str"
_FALLBACK_COLUMN = "used_fallback"
# Checkpoint-only marker: an empty result cannot be told apart from an
# unprocessed row once it has been through CSV, so progress is stored explicitly.
_DONE_COLUMN = "language_extraction_done"


def _join_text_fields(row, columns):
    """Join the non-missing values of *columns* in *row* into one string.

    Missing columns and NaN cells are skipped, so they do not end up in the
    text as the literal word "nan".
    """
    parts = []
    for column in columns:
        value = row.get(column)
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)


def _load_extraction_checkpoint(checkpoint_file, row_count):
    """Return (start_idx, results, fallback_flags) restored from a checkpoint.

    Starts from scratch when there is no checkpoint, when it lacks the result
    column, or when its row count does not match the current dataset.
    """
    fresh = (0, [None] * row_count, [False] * row_count)
    if not os.path.exists(checkpoint_file):
        return fresh

    checkpoint_df = pd.read_csv(checkpoint_file)
    if _RESULT_COLUMN not in checkpoint_df.columns or len(checkpoint_df) != row_count:
        print("Checkpoint does not match the dataset. Starting from the beginning.")
        return fresh

    saved = checkpoint_df[_RESULT_COLUMN]
    if _DONE_COLUMN in checkpoint_df.columns:
        # Rows are processed in order, so progress is the leading run of True.
        start_idx = 0
        for done in checkpoint_df[_DONE_COLUMN].fillna(False).astype(bool):
            if not done:
                break
            start_idx += 1
    else:
        # Older checkpoints have no marker: resume after the last stored
        # result, which can only repeat work, never skip an unprocessed row.
        filled_positions = saved.notna().to_numpy().nonzero()[0]
        start_idx = int(filled_positions[-1]) + 1 if len(filled_positions) else 0

    results = [None] * row_count
    for pos in range(start_idx):
        value = saved.iloc[pos]
        # Empty results come back from CSV as NaN; restore them as ''.
        results[pos] = '' if pd.isna(value) else str(value)

    fallback_flags = [False] * row_count
    if _FALLBACK_COLUMN in checkpoint_df.columns:
        for pos in range(start_idx):
            value = checkpoint_df[_FALLBACK_COLUMN].iloc[pos]
            fallback_flags[pos] = False if pd.isna(value) else bool(value)

    print(f"Resuming from checkpoint at index {start_idx}")
    return start_idx, results, fallback_flags


def _save_extraction_checkpoint(df, results, fallback_flags, processed_count, checkpoint_file):
    """Write the progress so far to *checkpoint_file*.

    The data is written to a temporary file first and then moved into place,
    so an interrupted write does not leave a truncated checkpoint behind.
    """
    snapshot = df.copy()
    snapshot[_RESULT_COLUMN] = results
    snapshot[_FALLBACK_COLUMN] = fallback_flags
    snapshot[_DONE_COLUMN] = [pos < processed_count for pos in range(len(df))]
    temp_path = checkpoint_file + ".tmp"
    snapshot.to_csv(temp_path, index=False)
    os.replace(temp_path, checkpoint_file)


def main():
    """Extract programming languages for every job in seek_jobs.csv.

    Reads the dataset, extracts languages for each row (Ollama first, keyword
    fallback when that yields nothing), checkpoints progress every 50 jobs,
    writes the enriched dataset to seek_jobs_with_languages.csv and prints a
    summary. Per-job details go to language_extraction_log.txt.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_languages.csv"
    log_path = "language_extraction_log.txt"
    checkpoint_file = "language_extraction_checkpoint.csv"
    checkpoint_interval = 50

    # Columns to extract programming languages from
    title_column = "job_title"
    text_columns = ("job_type", title_column, "job_details", "description")

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting programming languages extraction...\n")

        start_idx, extracted_languages_str, used_fallback = _load_extraction_checkpoint(
            checkpoint_file, total_jobs
        )

        # Per-job output goes to the log file; the context managers restore
        # stdout and close the file even if the run is interrupted.
        with open(log_path, 'w', encoding="utf-8") as log_file, \
                contextlib.redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                combined_text = _join_text_fields(row, text_columns)

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    languages = extract_programming_languages_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not languages:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        languages = fallback_extract_languages(combined_text)
                        used_fallback[idx] = True

                    extracted_languages_str[idx] = ', '.join(languages) if languages else ''
                else:
                    print("Combined text is empty for this job")
                    extracted_languages_str[idx] = ''

                print("-" * 50)

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    _save_extraction_checkpoint(
                        df, extracted_languages_str, used_fallback, idx + 1, checkpoint_file
                    )

                if combined_text:
                    # Avoid overwhelming the local Ollama service
                    time.sleep(random.uniform(0.5, 1.5))

        df[_RESULT_COLUMN] = extracted_languages_str
        df[_FALLBACK_COLUMN] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [
            column for column in (title_column, _RESULT_COLUMN, _FALLBACK_COLUMN)
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_languages = sum(1 for langs in extracted_languages_str if langs)
        jobs_using_fallback = sum(1 for flag in used_fallback if flag)
        # Guard the percentages against an empty dataset.
        share_with_languages = jobs_with_languages / total_jobs * 100 if total_jobs else 0.0
        share_using_fallback = jobs_using_fallback / total_jobs * 100 if total_jobs else 0.0

        print(f"\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted languages: {jobs_with_languages} ({share_with_languages:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({share_using_fallback:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary: report the failure on the console with a traceback.
        print(f"Script error: {str(e)}")
        traceback.print_exc()


if __name__ == "__main__":
    main()