✅ Unlike Lesson 25, which only extracted tools, this version of the code also saves the results to seek_jobs_with_tools.csv.
✅ Loads job data from seek_jobs.csv and processes each job description.
✅ Uses the Gemma model (gemma3:1b), running locally via Ollama, to extract explicitly mentioned technical tools from the job text.
✅ Falls back to keyword-based extraction if the AI model fails or returns nothing.
✅ Saves the extracted tools and fallback status into new columns, along with checkpoints and logs.
✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed to ensure intermediate results are not lost.
```python
import json
import pandas as pd
import requests
import time
import random
from requests.exceptions import Timeout, ConnectionError


def extract_tools_with_ollama(job_text, max_retries=3, timeout=60):
    """
    Extract technical tools from job text using Ollama with the Gemma model,
    with improved error handling and retries.
    """
    if pd.isna(job_text) or job_text == "":
        return []

    job_text_lower = job_text.lower()

    prompt = f"""
    You are a specialized tool extractor for job descriptions. Your task is to identify ONLY specific named technical tools, technologies, software, programming languages, platforms, and frameworks.

    STRICT RULES:
    1. Extract ONLY proper noun names of specific technologies
    2. Include ONLY: software names, programming languages, frameworks, platforms, databases, cloud services
    3. Do NOT include: general skills, methodologies, concepts, or descriptive terms
    4. Do NOT include adjectives or general terms like "analytical", "digital", "innovative"
    5. IMPORTANT: ONLY extract tools that are EXPLICITLY mentioned in the text
    6. If no specific tools are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Python, Java, AWS, Azure, Excel, SQL, PowerBI, Tableau, Git, Docker, React, TensorFlow
    - DO NOT INCLUDE: analytical, technical, problem-solving, digital transformation, innovative

    Job Text:
    {job_text}

    Respond ONLY with a JSON array containing the list of specific technical tools found.
    Example response format: ["Python", "AWS", "Docker"]
    """

    for attempt in range(max_retries):
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False
                },
                timeout=timeout  # Increased timeout
            )

            if response.status_code == 200:
                result = response.json()["response"]

                try:
                    start_idx = result.find('[')
                    end_idx = result.rfind(']') + 1

                    if start_idx >= 0 and end_idx > start_idx:
                        json_str = result[start_idx:end_idx]
                        tools = json.loads(json_str)

                        non_tools = ["analytical", "technical", "digital", "innovative", "advanced",
                                     "solution", "problem-solving", "communication", "teamwork",
                                     "methodology", "approach", "strategy", "skill"]

                        filtered_tools = [tool for tool in tools
                                          if tool.lower() not in [nt.lower() for nt in non_tools]
                                          and len(tool) > 1
                                          and not tool.lower().endswith("ing")]

                        verified_tools = []
                        for tool in filtered_tools:
                            if tool.lower() in job_text_lower:
                                verified_tools.append(tool)
                            elif tool.lower() == "javascript" and "js" in job_text_lower:
                                verified_tools.append(tool)
                            elif tool.lower() == "microsoft excel" and "excel" in job_text_lower:
                                verified_tools.append("Excel")
                            elif tool.lower() == "amazon web services" and "aws" in job_text_lower:
                                verified_tools.append("AWS")

                        print(f"Initial extraction: {tools}")
                        print(f"After verification: {verified_tools}")
                        return verified_tools
                    else:
                        print(f"Couldn't find JSON array in: {result}")
                        return []
                except json.JSONDecodeError:
                    print(f"Failed to parse JSON from: {result}")
                    return []
            else:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # If it's a server error, retry
                if response.status_code >= 500:
                    if attempt < max_retries - 1:
                        # Add exponential backoff with jitter
                        backoff_time = (2 ** attempt) + random.uniform(0, 1)
                        print(f"Retrying in {backoff_time:.2f} seconds...")
                        time.sleep(backoff_time)
                        continue
                return []

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if attempt < max_retries - 1:
                # Add exponential backoff with jitter
                backoff_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Retrying in {backoff_time:.2f} seconds...")
                time.sleep(backoff_time)
            else:
                print("Max retries exceeded. Moving on without tools extraction.")
                return []
        except Exception as e:
            print(f"Unexpected exception: {str(e)}")
            return []

    return []
```
```python
def fallback_extract_tools(job_text):
    """
    Simple keyword-based tool extraction as fallback when Ollama fails
    """
    if pd.isna(job_text) or job_text == "":
        return []

    common_tools = [
        "Python", "Java", "JavaScript", "TypeScript", "C#", "C++", "Ruby", "PHP", "Go", "Rust",
        "AWS", "Azure", "GCP", "SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "Excel", "Word",
        "PowerPoint", "PowerBI", "Tableau", "Looker", "Git", "Docker", "Kubernetes", "Jenkins",
        "Jira", "Confluence", "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
        "TensorFlow", "PyTorch", "Hadoop", "Spark", "Kafka", "Airflow", "Linux", "Windows", "MacOS",
        "SAP", "Salesforce", "ServiceNow", "Workday", "SharePoint", ".NET", "R", "MATLAB"
    ]

    job_text_lower = job_text.lower()
    found_tools = []

    for tool in common_tools:
        if tool.lower() in job_text_lower or (tool == "JavaScript" and "js" in job_text_lower):
            found_tools.append(tool)

    return found_tools


def main():
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_tools.csv"
    log_path = "extraction_log.txt"

    # Setup logging to file
    import sys
    original_stdout = sys.stdout
    log_file = open(log_path, 'w')

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"📄 Data loaded: {len(df)} rows")

        title_column = "job_title"
        description_column = "description"
        job_details_column = "job_details"
        job_type_column = "job_type"

        print(f"Total records: {len(df)}")
        print("Starting tools extraction...\n")

        all_tools = []
        extracted_tools_str = []
        used_fallback = []

        processing_range = df

        # Create a checkpoint system to save progress periodically
        checkpoint_interval = 50
        last_checkpoint = 0
        checkpoint_file = "extraction_checkpoint.csv"

        # Load checkpoint if exists
        import os
        start_idx = 0
        if os.path.exists(checkpoint_file):
            checkpoint_df = pd.read_csv(checkpoint_file)
            if 'extracted_tools_str' in checkpoint_df.columns:
                extracted_so_far = checkpoint_df['extracted_tools_str'].notna().sum()
                start_idx = extracted_so_far
                print(f"Resuming from checkpoint at index {start_idx}")
                # Copy already processed rows
                extracted_tools_str = checkpoint_df['extracted_tools_str'].tolist()
                used_fallback = checkpoint_df['used_fallback'].tolist() if 'used_fallback' in checkpoint_df.columns else [False] * len(extracted_tools_str)
            else:
                extracted_tools_str = [None] * len(df)
                used_fallback = [False] * len(df)
        else:
            extracted_tools_str = [None] * len(df)
            used_fallback = [False] * len(df)

        sys.stdout = log_file
        for idx in range(start_idx, len(processing_range)):
            row = processing_range.iloc[idx]
            print(f"\nJob {idx + 1}/{len(processing_range)}:")

            job_title = row[title_column] if title_column in df.columns else "Not specified"
            print(f"Job Title: {job_title}")

            combined_text = (
                str(row.get(job_type_column, '')) + ' ' +
                str(row.get(title_column, '')) + ' ' +
                str(row.get(job_details_column, '')) + ' ' +
                str(row.get(description_column, ''))
            ).strip()

            if combined_text:
                print("Processing combined text:")
                # Try with Ollama first
                tools = extract_tools_with_ollama(combined_text)

                # If Ollama fails or returns empty, try fallback
                if not tools:
                    print("Ollama extraction failed or returned empty. Using fallback extraction.")
                    tools = fallback_extract_tools(combined_text)
                    used_fallback[idx] = True

                all_tools.append(tools)
                extracted_tools_str[idx] = (', '.join(tools) if tools else '')
            else:
                print("Combined text is empty for this job")
                all_tools.append([])
                extracted_tools_str[idx] = ''

            print("-" * 50)

            # Save checkpoint periodically
            if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == len(processing_range):
                print(f"Creating checkpoint at index {idx}")
                temp_df = df.copy()
                temp_df['extracted_tools_str'] = extracted_tools_str
                temp_df['used_fallback'] = used_fallback
                temp_df.to_csv(checkpoint_file, index=False)

            # Avoid overwhelming the local Ollama service
            time.sleep(random.uniform(0.5, 1.5))

        # Reset stdout and close log file
        sys.stdout = original_stdout
        log_file.close()

        df['extracted_tools_str'] = extracted_tools_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        print(df[[title_column, 'extracted_tools_str', 'used_fallback']].head(10))

        # Print summary statistics
        total_jobs = len(df)
        jobs_with_tools = sum(1 for tools in extracted_tools_str if tools)
        jobs_using_fallback = sum(used_fallback)

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted tools: {jobs_with_tools} ({jobs_with_tools/total_jobs*100:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({jobs_using_fallback/total_jobs*100:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        sys.stdout = original_stdout
        log_file.close()
        print(f"Script error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
```
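Once seek_jobs_with_tools.csv has been written, a short follow-up script can turn the comma-separated extracted_tools_str column back into lists and rank the most frequently mentioned tools. This is a minimal sketch rather than part of the lesson's script; it assumes the column names written above and that used_fallback round-trips through CSV as booleans.

```python
import pandas as pd
from collections import Counter

# Load the output produced by the main script above.
df = pd.read_csv("seek_jobs_with_tools.csv")

# Split the comma-separated strings back into lists (empty or missing -> empty list).
tool_lists = (
    df["extracted_tools_str"]
    .fillna("")
    .apply(lambda s: [t.strip() for t in s.split(",") if t.strip()])
)

# Count how often each tool appears across all job ads.
counts = Counter(tool for tools in tool_lists for tool in tools)
print(counts.most_common(15))

# Share of jobs where the keyword fallback was used instead of the model
# (assumes pandas parsed the True/False values as booleans).
print(f"Fallback rate: {df['used_fallback'].mean():.1%}")
```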