AusBiz Consulting

✅ Compared to the previous lesson (no. 26), this script improves soft skill extraction by verifying the results, saving them to seek_jobs_with_skills.csv, and handling AI/API failures gracefully.

✅ Extracts soft skills from job descriptions using Ollama's Gemma model, with strict filtering and synonym matching.

✅ Implements retry logic, timeout handling, and fallbacks to keyword-based extraction if AI fails.

✅ Checks for presence of extracted soft skills (or their variations) in the original job text to ensure accuracy.

✅ Uses checkpointing to resume progress and prevent data loss during large batch processing.

✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed to ensure intermediate results are not lost.

Python

import contextlib
import json
import os
import random
import re
import time
import traceback

import pandas as pd
import requests
from requests.exceptions import Timeout, ConnectionError


# Items the model sometimes returns that are tools, fields or credentials
# rather than soft skills. Compared case-insensitively against whole items.
_NON_SOFT_SKILLS = frozenset({
    "python", "java", "excel", "sql", "aws", "azure", "tableau",
    "powerbi", "data", "analytics", "database", "programming",
    "degree", "bachelor", "master", "certification", "license",
})

# Canonical soft skill -> wording variations accepted as evidence in the text.
_SKILL_SYNONYMS = {
    "communication": ["communicate", "verbal", "written communication", "articulate", "presenting"],
    "leadership": ["lead", "leading", "leader", "motivate", "influence"],
    "teamwork": ["team player", "collaborate", "collaboration", "team-oriented", "team work"],
    "adaptability": ["adapt", "flexible", "flexibility", "versatile", "versatility"],
    "problem-solving": ["solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"],
    "time management": ["prioritize", "prioritization", "deadlines", "punctual", "time-conscious"],
    "creativity": ["creative", "innovative", "innovation", "creative thinking", "think outside the box"],
    "critical thinking": ["analytical", "analyze", "critical", "logical", "reasoning"],
}


def _backoff_delay(attempt):
    """Return the exponential backoff (plus up to 1s of jitter) for a retry."""
    return (2 ** attempt) + random.uniform(0, 1)


def _parse_skill_array(result):
    """Return the JSON array embedded in the model output, or None if absent/invalid."""
    start_idx = result.find('[')
    end_idx = result.rfind(']') + 1
    if start_idx < 0 or end_idx <= start_idx:
        print(f"Couldn't find JSON array in: {result}")
        return None
    try:
        return json.loads(result[start_idx:end_idx])
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {result}")
        return None


def _verify_soft_skills(candidates, job_description_lower):
    """Keep only candidates evidenced in the lower-cased job text, without duplicates."""
    verified = []
    seen = set()

    def add(skill):
        # The same skill can arrive both verbatim and via a synonym; keep one.
        key = skill.lower()
        if key not in seen:
            seen.add(key)
            verified.append(skill)

    for skill in candidates:
        skill_lower = skill.lower()

        # Direct mention in the job description.
        if skill_lower in job_description_lower:
            add(skill)
            continue

        # Otherwise accept it when a known variation of the same skill appears,
        # and report the standardized (canonical) name.
        for main_skill, synonyms in _SKILL_SYNONYMS.items():
            if skill_lower != main_skill and skill_lower not in synonyms:
                continue
            if main_skill in job_description_lower or any(
                syn in job_description_lower for syn in synonyms
            ):
                add(main_skill.title())
                break

    return verified


def extract_soft_skills_with_ollama(job_description, max_retries=3, timeout=60):
    """
    Extract soft skills from a job description using a local Ollama Gemma model.

    The model's answer is parsed as a JSON array, filtered against a blacklist
    of technical terms, and every remaining item is verified against the job
    text (directly or through known synonyms).

    Args:
        job_description: Text of the job posting. None/NaN, empty or
            whitespace-only input returns [] without calling the API.
        max_retries: Maximum number of API attempts. Timeouts, connection
            errors and HTTP 5xx responses are retried with exponential backoff.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of verified soft skill names; [] on any failure or when the
        model output cannot be parsed.
    """
    if not isinstance(job_description, str):
        if pd.isna(job_description):
            return []
        job_description = str(job_description)
    if not job_description.strip():
        # Nothing to analyse; avoid a pointless model call.
        return []

    job_description_lower = job_description.lower()

    prompt = f"""
    You are a specialized soft skills extractor for job descriptions. Your task is to identify ONLY soft skills, interpersonal abilities, and personal attributes mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY soft skills and interpersonal abilities
    2. Include ONLY: communication skills, leadership qualities, interpersonal traits, personal attributes
    3. Do NOT include: technical skills, tools, programming languages, or job requirements
    4. Do NOT include job titles, responsibilities, or education requirements
    5. IMPORTANT: ONLY extract soft skills that are EXPLICITLY mentioned in the text
    6. If no soft skills are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: communication, teamwork, leadership, problem-solving, adaptability, time management, creativity, critical thinking
    - DO NOT INCLUDE: Python, project management (as a role), BA degree, Excel, analytics (as a field)

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of soft skills found.
    Example response format: ["Communication", "Teamwork", "Leadership"]
    """

    # Make request to local Ollama API with retry logic
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False
                },
                timeout=timeout
            )

            if response.status_code != 200:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server-side errors are worth retrying.
                if response.status_code >= 500 and not is_last_attempt:
                    backoff_time = _backoff_delay(attempt)
                    print(f"Retrying in {backoff_time:.2f} seconds...")
                    time.sleep(backoff_time)
                    continue
                return []

            result = response.json()["response"]
            soft_skills = _parse_skill_array(result)
            if soft_skills is None:
                return []

            # Drop non-string items (the model may emit numbers or objects),
            # single characters, and blacklisted technical terms.
            filtered_skills = [
                skill.strip() for skill in soft_skills
                if isinstance(skill, str)
                and len(skill.strip()) > 1
                and skill.strip().lower() not in _NON_SOFT_SKILLS
            ]

            # CRITICAL: keep only skills that actually appear in the job text.
            verified_skills = _verify_soft_skills(filtered_skills, job_description_lower)

            print(f"Initial extraction: {soft_skills}")
            print(f"After verification: {verified_skills}")
            return verified_skills

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if is_last_attempt:
                print("Max retries exceeded. Moving on without skills extraction.")
                return []
            backoff_time = _backoff_delay(attempt)
            print(f"Retrying in {backoff_time:.2f} seconds...")
            time.sleep(backoff_time)
        except Exception as e:
            # Best-effort extraction: report and let the caller use its fallback.
            print(f"Unexpected exception: {str(e)}")
            return []

    return []


# Soft skill label -> lower-case keywords whose presence signals that skill.
_FALLBACK_SOFT_SKILLS = {
    "Communication": ["communication", "communicate", "verbal", "written", "articulate", "presenting", "presentation"],
    "Teamwork": ["teamwork", "team player", "collaborate", "collaboration", "team-oriented", "team work"],
    "Leadership": ["leadership", "lead", "leading", "leader", "motivate", "influence"],
    "Problem-solving": ["problem-solving", "solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"],
    "Adaptability": ["adaptability", "adapt", "flexible", "flexibility", "versatile", "versatility"],
    "Time Management": ["time management", "prioritize", "prioritization", "deadlines", "punctual", "time-conscious"],
    "Creativity": ["creativity", "creative", "innovative", "innovation", "creative thinking", "think outside the box"],
    "Critical Thinking": ["critical thinking", "analytical", "analyze", "critical", "logical", "reasoning"],
    "Attention to Detail": ["attention to detail", "detail-oriented", "meticulous", "thorough", "precise", "accuracy"],
    "Interpersonal Skills": ["interpersonal", "people skills", "rapport", "relationship building"],
    "Organizational Skills": ["organizational", "organized", "organization", "structure", "systematic"],
    "Self-motivation": ["self-motivated", "self-starter", "initiative", "proactive", "driven", "ambitious"],
    "Conflict Resolution": ["conflict resolution", "mediation", "dispute", "negotiation", "diplomacy"],
    "Decision Making": ["decision making", "decisive", "judgment", "discernment"],
    "Emotional Intelligence": ["emotional intelligence", "empathy", "self-awareness", "social awareness"],
    "Resilience": ["resilience", "resilient", "perseverance", "tenacity", "determination"],
    "Customer Service": ["customer service", "client-focused", "customer-oriented", "service-minded"],
}

# One pattern per skill. The leading \b anchors each keyword at a word start,
# so "structure" no longer fires on "infrastructure" or "lead" on "misleading",
# while inflected forms ("collaborates", "leads") still match.
_FALLBACK_PATTERNS = {
    skill: re.compile(r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + ")")
    for skill, keywords in _FALLBACK_SOFT_SKILLS.items()
}


def fallback_extract_soft_skills(job_description):
    """
    Simple keyword-based soft skills extraction used as a fallback when Ollama fails.

    Args:
        job_description: Text of the job posting. None/NaN, empty or
            whitespace-only input returns [].

    Returns:
        Skill labels (each at most once, in the order of the keyword table)
        for which at least one keyword starts a word in the text.
    """
    if not isinstance(job_description, str):
        if pd.isna(job_description):
            return []
        job_description = str(job_description)

    job_description_lower = job_description.lower()
    if not job_description_lower.strip():
        return []

    return [
        skill
        for skill, pattern in _FALLBACK_PATTERNS.items()
        if pattern.search(job_description_lower)
    ]


# Checkpoint-only column marking rows already handled. It is needed because an
# empty result is written as an empty CSV cell and read back as NaN, which is
# indistinguishable from "not processed yet".
_CHECKPOINT_DONE_COLUMN = "skills_processed"


def _combine_row_text(row, columns):
    """Join the non-missing values of the given columns into one text."""
    parts = []
    for column in columns:
        value = row.get(column, '')
        # str(NaN) would inject the literal word "nan" into the text.
        if pd.notna(value):
            text = str(value).strip()
            if text:
                parts.append(text)
    return ' '.join(parts)


def _load_checkpoint(checkpoint_file, n_rows):
    """Return (start_idx, skills, fallback_flags) restored from a checkpoint, if usable."""
    skills = [None] * n_rows
    fallback_flags = [False] * n_rows

    if not os.path.exists(checkpoint_file):
        return 0, skills, fallback_flags

    checkpoint_df = pd.read_csv(checkpoint_file)
    if 'extracted_soft_skills_str' not in checkpoint_df.columns:
        return 0, skills, fallback_flags
    if len(checkpoint_df) != n_rows:
        print("Checkpoint does not match the dataset size; starting from the beginning.")
        return 0, skills, fallback_flags

    saved_skills = checkpoint_df['extracted_soft_skills_str'].tolist()
    if _CHECKPOINT_DONE_COLUMN in checkpoint_df.columns:
        done = checkpoint_df[_CHECKPOINT_DONE_COLUMN].eq(True).tolist()
    else:
        # Older checkpoint without the marker column: only non-empty results
        # are recognisable, so resume after the last one of those.
        done = [isinstance(value, str) for value in saved_skills]

    # Rows are processed in order, so everything up to the last done row is done.
    start_idx = max((i + 1 for i, flag in enumerate(done) if flag), default=0)

    if 'used_fallback' in checkpoint_df.columns:
        saved_flags = checkpoint_df['used_fallback'].eq(True).tolist()
    else:
        saved_flags = [False] * n_rows

    for i in range(start_idx):
        # Empty results come back from CSV as NaN; restore them as ''.
        skills[i] = saved_skills[i] if isinstance(saved_skills[i], str) else ''
        fallback_flags[i] = bool(saved_flags[i])

    print(f"Resuming from checkpoint at index {start_idx}")
    return start_idx, skills, fallback_flags


def _save_checkpoint(df, checkpoint_file, skills, fallback_flags, n_done):
    """Write progress for the first n_done rows, replacing the old checkpoint atomically."""
    snapshot = df.copy()
    snapshot['extracted_soft_skills_str'] = skills
    snapshot['used_fallback'] = fallback_flags
    snapshot[_CHECKPOINT_DONE_COLUMN] = [i < n_done for i in range(len(snapshot))]

    # Write to a temp file first so an interrupted write cannot corrupt the
    # existing checkpoint.
    tmp_path = checkpoint_file + ".tmp"
    snapshot.to_csv(tmp_path, index=False)
    os.replace(tmp_path, checkpoint_file)


def main():
    """
    Extract soft skills for every job in seek_jobs.csv and save the results.

    For each row the job type, title, details and description are combined and
    sent to the Ollama extractor, falling back to keyword matching when that
    yields nothing. Per-job output is written to skills_extraction_log.txt,
    progress is checkpointed every 50 rows (and after the last row), and the
    final dataset is written to seek_jobs_with_skills.csv. Errors are reported
    on the console together with a traceback.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_skills.csv"
    log_path = "skills_extraction_log.txt"
    checkpoint_file = "skills_extraction_checkpoint.csv"
    checkpoint_interval = 50

    # Columns to extract soft skills from
    title_column = "job_title"  # Column containing job titles
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"
    text_columns = (job_type_column, title_column, job_details_column, description_column)

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting soft skills extraction...\n")

        start_idx, extracted_soft_skills_str, used_fallback = _load_checkpoint(
            checkpoint_file, total_jobs
        )

        # Per-job output goes to the log file. The context managers restore
        # stdout and close the file even on an interrupt or error.
        with open(log_path, 'w', encoding='utf-8') as log_file, \
                contextlib.redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                # Combine relevant columns for more context
                combined_text = _combine_row_text(row, text_columns)

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    soft_skills = extract_soft_skills_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not soft_skills:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        soft_skills = fallback_extract_soft_skills(combined_text)
                        used_fallback[idx] = True

                    extracted_soft_skills_str[idx] = ', '.join(soft_skills) if soft_skills else ''
                else:
                    print("Combined text is empty for this job")
                    extracted_soft_skills_str[idx] = ''

                print("-" * 50)

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    _save_checkpoint(
                        df, checkpoint_file, extracted_soft_skills_str, used_fallback, idx + 1
                    )

                # Avoid overwhelming the local Ollama service
                time.sleep(random.uniform(0.5, 1.5))

        df['extracted_soft_skills_str'] = extracted_soft_skills_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        # The title column is optional, so only show the columns that exist.
        sample_columns = [
            column
            for column in (title_column, 'extracted_soft_skills_str', 'used_fallback')
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_skills = sum(1 for skills in extracted_soft_skills_str if skills)
        jobs_using_fallback = sum(used_fallback)
        # Guard the percentages against an empty dataset.
        skills_pct = jobs_with_skills / total_jobs * 100 if total_jobs else 0.0
        fallback_pct = jobs_using_fallback / total_jobs * 100 if total_jobs else 0.0

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted soft skills: {jobs_with_skills} ({skills_pct:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({fallback_pct:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary: report the failure on the console with details.
        print(f"Script error: {str(e)}")
        traceback.print_exc()

# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

1import json 2import pandas as pd 3import requests 4import time 5import random 6from requests.exceptions import Timeout, ConnectionError 7 8def extract_soft_skills_with_ollama(job_description, max_retries=3, timeout=60): 9 """ 10 Extract soft skills from job description using Ollama with Gemma model 11 with improved error handling and retries 12 """ 13 14 if pd.isna(job_description) or job_description == "": 15 return [] 16 17 job_description_lower = job_description.lower() 18 19 prompt = f""" 20 You are a specialized soft skills extractor for job descriptions. Your task is to identify ONLY soft skills, interpersonal abilities, and personal attributes mentioned in the job description. 21 22 STRICT RULES: 23 1. Extract ONLY soft skills and interpersonal abilities 24 2. Include ONLY: communication skills, leadership qualities, interpersonal traits, personal attributes 25 3. Do NOT include: technical skills, tools, programming languages, or job requirements 26 4. Do NOT include job titles, responsibilities, or education requirements 27 5. IMPORTANT: ONLY extract soft skills that are EXPLICITLY mentioned in the text 28 6. If no soft skills are mentioned, return an empty array [] 29 30 CORRECT EXAMPLES: 31 - INCLUDE: communication, teamwork, leadership, problem-solving, adaptability, time management, creativity, critical thinking 32 - DO NOT INCLUDE: Python, project management (as a role), BA degree, Excel, analytics (as a field) 33 34 Job Description: 35 {job_description} 36 37 Respond ONLY with a JSON array containing the list of soft skills found. 
38 Example response format: ["Communication", "Teamwork", "Leadership"] 39 """ 40 41 # Make request to local Ollama API with retry logic 42 for attempt in range(max_retries): 43 try: 44 print(f"API call attempt {attempt + 1}/{max_retries}") 45 response = requests.post( 46 "http://localhost:11434/api/generate", 47 json={ 48 "model": "gemma3:1b", 49 "prompt": prompt, 50 "stream": False 51 }, 52 timeout=timeout # Increased timeout 53 ) 54 55 if response.status_code == 200: 56 result = response.json()["response"] 57 58 # Extract JSON array from the response 59 try: 60 # Try to find JSON array in the response 61 start_idx = result.find('[') 62 end_idx = result.rfind(']') + 1 63 64 if start_idx >= 0 and end_idx > start_idx: 65 json_str = result[start_idx:end_idx] 66 soft_skills = json.loads(json_str) 67 68 # Additional filtering to remove non-soft skill items 69 non_soft_skills = ["python", "java", "excel", "sql", "aws", "azure", "tableau", 70 "powerbi", "data", "analytics", "database", "programming", 71 "degree", "bachelor", "master", "certification", "license"] 72 73 # Filter out any items that are likely not soft skills 74 filtered_skills = [skill for skill in soft_skills 75 if skill.lower() not in [ns.lower() for ns in non_soft_skills] 76 and len(skill) > 1] # Avoid single characters 77 78 # CRITICAL: Verify each soft skill actually appears in the job description 79 verified_skills = [] 80 81 # Common variations and synonyms for soft skills 82 skill_synonyms = { 83 "communication": ["communicate", "verbal", "written communication", "articulate", "presenting"], 84 "leadership": ["lead", "leading", "leader", "motivate", "influence"], 85 "teamwork": ["team player", "collaborate", "collaboration", "team-oriented", "team work"], 86 "adaptability": ["adapt", "flexible", "flexibility", "versatile", "versatility"], 87 "problem-solving": ["solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"], 88 "time management": ["prioritize", 
"prioritization", "deadlines", "punctual", "time-conscious"], 89 "creativity": ["creative", "innovative", "innovation", "creative thinking", "think outside the box"], 90 "critical thinking": ["analytical", "analyze", "critical", "logical", "reasoning"] 91 } 92 93 for skill in filtered_skills: 94 skill_lower = skill.lower() 95 96 # Check if the skill directly appears in the job description 97 if skill_lower in job_description_lower: 98 verified_skills.append(skill) 99 continue 100 101 # Check for synonyms and variations 102 for main_skill, synonyms in skill_synonyms.items(): 103 if skill_lower == main_skill or skill_lower in synonyms: 104 # Check if any synonyms appear in the text 105 if any(syn in job_description_lower for syn in synonyms) or main_skill in job_description_lower: 106 verified_skills.append(main_skill.title()) # Use standardized version 107 break 108 109 print(f"Initial extraction: {soft_skills}") 110 print(f"After verification: {verified_skills}") 111 return verified_skills 112 else: 113 print(f"Couldn't find JSON array in: {result}") 114 return [] 115 except json.JSONDecodeError: 116 print(f"Failed to parse JSON from: {result}") 117 return [] 118 else: 119 print(f"Error from Ollama API: {response.status_code} - {response.text}") 120 # If it's a server error, retry 121 if response.status_code >= 500: 122 if attempt < max_retries - 1: 123 # Add exponential backoff with jitter 124 backoff_time = (2 ** attempt) + random.uniform(0, 1) 125 print(f"Retrying in {backoff_time:.2f} seconds...") 126 time.sleep(backoff_time) 127 continue 128 return [] 129 except (Timeout, ConnectionError) as e: 130 print(f"Connection error on attempt {attempt + 1}: {str(e)}") 131 if attempt < max_retries - 1: 132 # Add exponential backoff with jitter 133 backoff_time = (2 ** attempt) + random.uniform(0, 1) 134 print(f"Retrying in {backoff_time:.2f} seconds...") 135 time.sleep(backoff_time) 136 else: 137 print(f"Max retries exceeded. 
Moving on without skills extraction.") 138 return [] 139 except Exception as e: 140 print(f"Unexpected exception: {str(e)}") 141 return [] 142 143 return [] 144 145def fallback_extract_soft_skills(job_description): 146 """ 147 Simple keyword-based soft skills extraction as fallback when Ollama fails 148 """ 149 if pd.isna(job_description) or job_description == "": 150 return [] 151 152 # List of common soft skills to look for 153 common_soft_skills = { 154 "Communication": ["communication", "communicate", "verbal", "written", "articulate", "presenting", "presentation"], 155 "Teamwork": ["teamwork", "team player", "collaborate", "collaboration", "team-oriented", "team work"], 156 "Leadership": ["leadership", "lead", "leading", "leader", "motivate", "influence"], 157 "Problem-solving": ["problem-solving", "solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"], 158 "Adaptability": ["adaptability", "adapt", "flexible", "flexibility", "versatile", "versatility"], 159 "Time Management": ["time management", "prioritize", "prioritization", "deadlines", "punctual", "time-conscious"], 160 "Creativity": ["creativity", "creative", "innovative", "innovation", "creative thinking", "think outside the box"], 161 "Critical Thinking": ["critical thinking", "analytical", "analyze", "critical", "logical", "reasoning"], 162 "Attention to Detail": ["attention to detail", "detail-oriented", "meticulous", "thorough", "precise", "accuracy"], 163 "Interpersonal Skills": ["interpersonal", "people skills", "rapport", "relationship building"], 164 "Organizational Skills": ["organizational", "organized", "organization", "structure", "systematic"], 165 "Self-motivation": ["self-motivated", "self-starter", "initiative", "proactive", "driven", "ambitious"], 166 "Conflict Resolution": ["conflict resolution", "mediation", "dispute", "negotiation", "diplomacy"], 167 "Decision Making": ["decision making", "decisive", "judgment", "discernment"], 168 "Emotional 
Intelligence": ["emotional intelligence", "empathy", "self-awareness", "social awareness"], 169 "Resilience": ["resilience", "resilient", "perseverance", "tenacity", "determination"], 170 "Customer Service": ["customer service", "client-focused", "customer-oriented", "service-minded"] 171 } 172 173 job_description_lower = job_description.lower() 174 found_skills = [] 175 176 for skill, keywords in common_soft_skills.items(): 177 for keyword in keywords: 178 if keyword.lower() in job_description_lower: 179 found_skills.append(skill) 180 break # Only add the skill once if any keyword is found 181 182 return found_skills 183 184def main(): 185 print("✅ Script started") 186 187 file_path = "seek_jobs.csv" 188 output_path = "seek_jobs_with_skills.csv" 189 log_path = "skills_extraction_log.txt" 190 191 # Setup logging to file 192 import sys 193 original_stdout = sys.stdout 194 log_file = open(log_path, 'w') 195 196 try: 197 print(f"Reading dataset from {file_path}...") 198 df = pd.read_csv(file_path) 199 print(f"📄 Data loaded: {len(df)} rows") 200 201 # Columns to extract soft skills from 202 title_column = "job_title" # Column containing job titles 203 description_column = "description" 204 job_details_column = "job_details" 205 job_type_column = "job_type" 206 207 print(f"Total records: {len(df)}") 208 print("Starting soft skills extraction...\n") 209 210 all_soft_skills = [] 211 extracted_soft_skills_str = [] 212 used_fallback = [] 213 214 processing_range = df 215 216 # Create a checkpoint system to save progress periodically 217 checkpoint_interval = 50 218 last_checkpoint = 0 219 checkpoint_file = "skills_extraction_checkpoint.csv" 220 221 # Load checkpoint if exists 222 import os 223 start_idx = 0 224 if os.path.exists(checkpoint_file): 225 checkpoint_df = pd.read_csv(checkpoint_file) 226 if 'extracted_soft_skills_str' in checkpoint_df.columns: 227 extracted_so_far = checkpoint_df['extracted_soft_skills_str'].notna().sum() 228 start_idx = extracted_so_far 229 
print(f"Resuming from checkpoint at index {start_idx}") 230 # Copy already processed rows 231 extracted_soft_skills_str = checkpoint_df['extracted_soft_skills_str'].tolist() 232 used_fallback = checkpoint_df['used_fallback'].tolist() if 'used_fallback' in checkpoint_df.columns else [False] * len(extracted_soft_skills_str) 233 else: 234 extracted_soft_skills_str = [None] * len(df) 235 used_fallback = [False] * len(df) 236 else: 237 extracted_soft_skills_str = [None] * len(df) 238 used_fallback = [False] * len(df) 239 240 sys.stdout = log_file 241 for idx in range(start_idx, len(processing_range)): 242 row = processing_range.iloc[idx] 243 print(f"\nJob {idx + 1}/{len(processing_range)}:") 244 245 job_title = row[title_column] if title_column in df.columns else "Not specified" 246 print(f"Job Title: {job_title}") 247 248 # Combine relevant columns for more context 249 combined_text = ( 250 str(row.get(job_type_column, '')) + ' ' + 251 str(row.get(title_column, '')) + ' ' + 252 str(row.get(job_details_column, '')) + ' ' + 253 str(row.get(description_column, '')) 254 ).strip() 255 256 if combined_text: 257 print("Processing combined text:") 258 # Try with Ollama first 259 soft_skills = extract_soft_skills_with_ollama(combined_text) 260 261 # If Ollama fails or returns empty, try fallback 262 if not soft_skills: 263 print("Ollama extraction failed or returned empty. 
Using fallback extraction.") 264 soft_skills = fallback_extract_soft_skills(combined_text) 265 used_fallback[idx] = True 266 267 all_soft_skills.append(soft_skills) 268 extracted_soft_skills_str[idx] = (', '.join(soft_skills) if soft_skills else '') 269 else: 270 print("Combined text is empty for this job") 271 all_soft_skills.append([]) 272 extracted_soft_skills_str[idx] = '' 273 274 print("-" * 50) 275 276 # Save checkpoint periodically 277 if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == len(processing_range): 278 print(f"Creating checkpoint at index {idx}") 279 temp_df = df.copy() 280 temp_df['extracted_soft_skills_str'] = extracted_soft_skills_str 281 temp_df['used_fallback'] = used_fallback 282 temp_df.to_csv(checkpoint_file, index=False) 283 284 # Avoid overwhelming the local Ollama service 285 time.sleep(random.uniform(0.5, 1.5)) 286 287 # Reset stdout and close log file 288 sys.stdout = original_stdout 289 log_file.close() 290 291 df['extracted_soft_skills_str'] = extracted_soft_skills_str 292 df['used_fallback'] = used_fallback 293 294 df.to_csv(output_path, index=False) 295 print(f"\nUpdated dataset saved to {output_path}") 296 297 print("\n--- Sample Results ---") 298 print(df[[title_column, 'extracted_soft_skills_str', 'used_fallback']].head(10)) 299 300 # Print summary statistics 301 total_jobs = len(df) 302 jobs_with_skills = sum(1 for skills in extracted_soft_skills_str if skills) 303 jobs_using_fallback = sum(used_fallback) 304 305 print(f"\n--- Summary Statistics ---") 306 print(f"Total jobs processed: {total_jobs}") 307 print(f"Jobs with extracted soft skills: {jobs_with_skills} ({jobs_with_skills/total_jobs*100:.1f}%)") 308 print(f"Jobs using fallback extraction: {jobs_using_fallback} ({jobs_using_fallback/total_jobs*100:.1f}%)") 309 print(f"\nDetailed logs saved to {log_path}") 310 311 except Exception as e: 312 sys.stdout = original_stdout 313 log_file.close() 314 print(f"Script error: {str(e)}") 315 import traceback 316 
traceback.print_exc() 317 318if __name__ == "__main__": 319 main()

Soft skill extraction and saving data into a CSV file (Source code)

Next Up

Lesson 32: Methods and Framework extraction and saving data into csv file(Source code)

Next Up

Lesson 32: Methods and Framework extraction and saving data into csv file(Source code)