✅ Unlike Lesson 25, which only extracted tools, this version of the code also saves the results to seek_jobs_with_tools.csv.
✅ Loads job data from seek_jobs.csv and processes each job description.
✅ Uses the Gemma model (gemma3:1b), running locally via Ollama, to extract explicitly mentioned technical tools from the job text.
✅ Falls back to keyword-based extraction if the AI model fails or returns nothing.
✅ Saves the extracted tools and fallback status into new columns, along with checkpoints and logs.
✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed to ensure intermediate results are not lost.
```python
import json
import pandas as pd
import requests
import time
import random
from requests.exceptions import Timeout, ConnectionError


def extract_tools_with_ollama(job_text, max_retries=3, timeout=60):
    """
    Extract technical tools from job text using Ollama with the Gemma model,
    with improved error handling and retries.
    """
    if pd.isna(job_text) or job_text == "":
        return []

    job_text_lower = job_text.lower()

    prompt = f"""
    You are a specialized tool extractor for job descriptions. Your task is to identify ONLY specific named technical tools, technologies, software, programming languages, platforms, and frameworks.

    STRICT RULES:
    1. Extract ONLY proper noun names of specific technologies
    2. Include ONLY: software names, programming languages, frameworks, platforms, databases, cloud services
    3. Do NOT include: general skills, methodologies, concepts, or descriptive terms
    4. Do NOT include adjectives or general terms like "analytical", "digital", "innovative"
    5. IMPORTANT: ONLY extract tools that are EXPLICITLY mentioned in the text
    6. If no specific tools are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Python, Java, AWS, Azure, Excel, SQL, PowerBI, Tableau, Git, Docker, React, TensorFlow
    - DO NOT INCLUDE: analytical, technical, problem-solving, digital transformation, innovative

    Job Text:
    {job_text}

    Respond ONLY with a JSON array containing the list of specific technical tools found.
    Example response format: ["Python", "AWS", "Docker"]
    """

    for attempt in range(max_retries):
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False
                },
                timeout=timeout  # Increased timeout
            )

            if response.status_code == 200:
                result = response.json()["response"]

                try:
                    start_idx = result.find('[')
                    end_idx = result.rfind(']') + 1

                    if start_idx >= 0 and end_idx > start_idx:
                        json_str = result[start_idx:end_idx]
                        tools = json.loads(json_str)

                        non_tools = ["analytical", "technical", "digital", "innovative", "advanced",
                                     "solution", "problem-solving", "communication", "teamwork",
                                     "methodology", "approach", "strategy", "skill"]

                        filtered_tools = [tool for tool in tools
                                          if tool.lower() not in [nt.lower() for nt in non_tools]
                                          and len(tool) > 1
                                          and not tool.lower().endswith("ing")]

                        verified_tools = []
                        for tool in filtered_tools:
                            if tool.lower() in job_text_lower:
                                verified_tools.append(tool)
                            elif tool.lower() == "javascript" and "js" in job_text_lower:
                                verified_tools.append(tool)
                            elif tool.lower() == "microsoft excel" and "excel" in job_text_lower:
                                verified_tools.append("Excel")
                            elif tool.lower() == "amazon web services" and "aws" in job_text_lower:
                                verified_tools.append("AWS")

                        print(f"Initial extraction: {tools}")
                        print(f"After verification: {verified_tools}")
                        return verified_tools
                    else:
                        print(f"Couldn't find JSON array in: {result}")
                        return []
                except json.JSONDecodeError:
                    print(f"Failed to parse JSON from: {result}")
                    return []
            else:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # If it's a server error, retry
                if response.status_code >= 500:
                    if attempt < max_retries - 1:
                        # Add exponential backoff with jitter
                        backoff_time = (2 ** attempt) + random.uniform(0, 1)
                        print(f"Retrying in {backoff_time:.2f} seconds...")
                        time.sleep(backoff_time)
                        continue
                return []

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if attempt < max_retries - 1:
                # Add exponential backoff with jitter
                backoff_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Retrying in {backoff_time:.2f} seconds...")
                time.sleep(backoff_time)
            else:
                print("Max retries exceeded. Moving on without tools extraction.")
                return []
        except Exception as e:
            print(f"Unexpected exception: {str(e)}")
            return []

    return []
```
```python
def fallback_extract_tools(job_text):
    """
    Simple keyword-based tool extraction as fallback when Ollama fails
    """
    if pd.isna(job_text) or job_text == "":
        return []

    common_tools = [
        "Python", "Java", "JavaScript", "TypeScript", "C#", "C++", "Ruby", "PHP", "Go", "Rust",
        "AWS", "Azure", "GCP", "SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "Excel", "Word",
        "PowerPoint", "PowerBI", "Tableau", "Looker", "Git", "Docker", "Kubernetes", "Jenkins",
        "Jira", "Confluence", "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
        "TensorFlow", "PyTorch", "Hadoop", "Spark", "Kafka", "Airflow", "Linux", "Windows", "MacOS",
        "SAP", "Salesforce", "ServiceNow", "Workday", "SharePoint", ".NET", "R", "MATLAB"
    ]

    job_text_lower = job_text.lower()
    found_tools = []

    for tool in common_tools:
        if tool.lower() in job_text_lower or (tool == "JavaScript" and "js" in job_text_lower):
            found_tools.append(tool)

    return found_tools


def main():
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_tools.csv"
    log_path = "extraction_log.txt"

    # Setup logging to file
    import sys
    original_stdout = sys.stdout
    log_file = open(log_path, 'w')

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"📄 Data loaded: {len(df)} rows")

        title_column = "job_title"
        description_column = "description"
        job_details_column = "job_details"
        job_type_column = "job_type"

        print(f"Total records: {len(df)}")
        print("Starting tools extraction...\n")

        all_tools = []
        extracted_tools_str = []
        used_fallback = []

        processing_range = df

        # Create a checkpoint system to save progress periodically
        checkpoint_interval = 50
        last_checkpoint = 0
        checkpoint_file = "extraction_checkpoint.csv"

        # Load checkpoint if exists
        import os
        start_idx = 0
        if os.path.exists(checkpoint_file):
            checkpoint_df = pd.read_csv(checkpoint_file)
            if 'extracted_tools_str' in checkpoint_df.columns:
                extracted_so_far = checkpoint_df['extracted_tools_str'].notna().sum()
                start_idx = extracted_so_far
                print(f"Resuming from checkpoint at index {start_idx}")
                # Copy already processed rows
                extracted_tools_str = checkpoint_df['extracted_tools_str'].tolist()
                used_fallback = checkpoint_df['used_fallback'].tolist() if 'used_fallback' in checkpoint_df.columns else [False] * len(extracted_tools_str)
            else:
                extracted_tools_str = [None] * len(df)
                used_fallback = [False] * len(df)
        else:
            extracted_tools_str = [None] * len(df)
            used_fallback = [False] * len(df)

        sys.stdout = log_file
        for idx in range(start_idx, len(processing_range)):
            row = processing_range.iloc[idx]
            print(f"\nJob {idx + 1}/{len(processing_range)}:")

            job_title = row[title_column] if title_column in df.columns else "Not specified"
            print(f"Job Title: {job_title}")

            combined_text = (
                str(row.get(job_type_column, '')) + ' ' +
                str(row.get(title_column, '')) + ' ' +
                str(row.get(job_details_column, '')) + ' ' +
                str(row.get(description_column, ''))
            ).strip()

            if combined_text:
                print("Processing combined text:")
                # Try with Ollama first
                tools = extract_tools_with_ollama(combined_text)

                # If Ollama fails or returns empty, try fallback
                if not tools:
                    print("Ollama extraction failed or returned empty. Using fallback extraction.")
                    tools = fallback_extract_tools(combined_text)
                    used_fallback[idx] = True

                all_tools.append(tools)
                extracted_tools_str[idx] = (', '.join(tools) if tools else '')
            else:
                print("Combined text is empty for this job")
                all_tools.append([])
                extracted_tools_str[idx] = ''

            print("-" * 50)

            # Save checkpoint periodically
            if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == len(processing_range):
                print(f"Creating checkpoint at index {idx}")
                temp_df = df.copy()
                temp_df['extracted_tools_str'] = extracted_tools_str
                temp_df['used_fallback'] = used_fallback
                temp_df.to_csv(checkpoint_file, index=False)

            # Avoid overwhelming the local Ollama service
            time.sleep(random.uniform(0.5, 1.5))

        # Reset stdout and close log file
        sys.stdout = original_stdout
        log_file.close()

        df['extracted_tools_str'] = extracted_tools_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        print(df[[title_column, 'extracted_tools_str', 'used_fallback']].head(10))

        # Print summary statistics
        total_jobs = len(df)
        jobs_with_tools = sum(1 for tools in extracted_tools_str if tools)
        jobs_using_fallback = sum(used_fallback)

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted tools: {jobs_with_tools} ({jobs_with_tools/total_jobs*100:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({jobs_using_fallback/total_jobs*100:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        sys.stdout = original_stdout
        log_file.close()
        print(f"Script error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
```
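Once seek_jobs_with_tools.csv has been written, a short follow-up script can turn the comma-separated extracted_tools_str column back into lists and rank the most frequently mentioned tools. This is a minimal sketch rather than part of the lesson's script; it assumes the column names written above and that used_fallback round-trips through CSV as booleans.

```python
import pandas as pd
from collections import Counter

# Load the output produced by the main script above.
df = pd.read_csv("seek_jobs_with_tools.csv")

# Split the comma-separated strings back into lists (empty or missing -> empty list).
tool_lists = (
    df["extracted_tools_str"]
    .fillna("")
    .apply(lambda s: [t.strip() for t in s.split(",") if t.strip()])
)

# Count how often each tool appears across all job ads.
counts = Counter(tool for tools in tool_lists for tool in tools)
print(counts.most_common(15))

# Share of jobs where the keyword fallback was used instead of the model
# (assumes pandas parsed the True/False values as booleans).
print(f"Fallback rate: {df['used_fallback'].mean():.1%}")
```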