AusBiz Consulting

✅ Compared to the previous lesson (no. 27), this script improves on soft-skill extraction by verifying the results, saving them to seek_jobs_with_methods_frameworks.csv, and handling AI/API failures gracefully.

✅ Extracts project management methods and frameworks using Ollama's Gemma model, with strict rules and synonym-based validation.

✅ Includes robust retry logic, timeout handling, and a fallback keyword-based extractor to handle LLM/API failures.

✅ Verifies extracted items by checking for their exact or synonymous presence in the job description text.

✅ Automatically saves progress to a checkpoint file after every 50 job descriptions processed to ensure intermediate results are not lost.

Python

import contextlib
import json
import os
import random
import re
import time
import traceback

import pandas as pd
import requests
from requests.exceptions import Timeout, ConnectionError

# Terms that are ordinary English words when lower-cased; they only count as
# a methodology when written with their canonical capitalisation.
_LLM_CASE_SENSITIVE_TERMS = {"safe": "SAFe", "less": "LeSS"}

# Items the model sometimes returns that are never methods or frameworks.
_LLM_NON_METHODS = frozenset([
    "python", "java", "excel", "sql", "leadership", "communication",
    "teamwork", "degree", "bachelor", "master", "certification",
])

# Canonical display name -> lower-case variations and synonyms.
_LLM_METHOD_SYNONYMS = {
    "Agile": ("agile methodology", "agile environment", "agile principles", "agile practices"),
    "Scrum": ("scrum master", "scrum team", "scrum methodology", "scrum framework"),
    "Kanban": ("kanban board", "kanban method", "kanban methodology"),
    "Waterfall": ("waterfall model", "waterfall methodology", "waterfall approach"),
    "Lean": ("lean methodology", "lean principles", "lean thinking", "lean management"),
    "Six Sigma": ("6 sigma", "six-sigma", "6-sigma", "lean six sigma"),
    "DevOps": ("dev ops", "dev-ops", "devops practices", "devops culture"),
    "SAFe": ("scaled agile framework", "safe framework", "safe methodology"),
    "PRINCE2": ("prince 2", "prince-2", "projects in controlled environments"),
    "ITIL": ("information technology infrastructure library", "itil framework", "itil processes"),
    "Extreme Programming": ("xp", "xp methodology", "extreme programming practices"),
    "Design Thinking": ("design-thinking", "design thinking methodology"),
    "OKRs": ("objectives and key results", "okr framework", "okr methodology"),
}


def _llm_term_in_text(term, text):
    """Return True when *term* occurs in *text* as a whole word or phrase.

    A plain substring test would "find" XP inside "experience" or Lean inside
    "clean", so the term must not be flanked by word characters.
    """
    exact_spelling = _LLM_CASE_SENSITIVE_TERMS.get(term.lower())
    if exact_spelling is not None:
        needle, flags = exact_spelling, 0
    else:
        needle, flags = term, re.IGNORECASE
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, text, flags) is not None


def _parse_llm_methods(raw_response):
    """Return the candidate names found in the model output, or None if unparsable."""
    start_idx = raw_response.find('[')
    end_idx = raw_response.rfind(']') + 1
    if start_idx < 0 or end_idx <= start_idx:
        print(f"Couldn't find JSON array in: {raw_response}")
        return None
    try:
        items = json.loads(raw_response[start_idx:end_idx])
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {raw_response}")
        return None

    candidates = []
    for item in items:
        # The model occasionally emits numbers or nested objects; skip them
        # instead of letting one bad item discard the whole answer.
        if not isinstance(item, str):
            continue
        name = item.strip()
        if len(name) > 1 and name.lower() not in _LLM_NON_METHODS:
            candidates.append(name)
    return candidates


def _resolve_llm_synonym(key, text):
    """Return the canonical name for *key* if any of its variants is in *text*."""
    for display_name, synonyms in _LLM_METHOD_SYNONYMS.items():
        main_method = display_name.lower()
        if key != main_method and key not in synonyms:
            continue
        if any(_llm_term_in_text(variant, text) for variant in (main_method, *synonyms)):
            return display_name
    return None


def _verify_llm_methods(candidates, job_description):
    """Keep only the candidates that really appear in the text, without duplicates."""
    verified = []
    seen = set()
    for candidate in candidates:
        if _llm_term_in_text(candidate, job_description):
            match = candidate
        else:
            match = _resolve_llm_synonym(candidate.lower(), job_description)
        if match is not None and match.lower() not in seen:
            seen.add(match.lower())
            verified.append(match)
    return verified


def _backoff_before_retry(attempt):
    """Sleep with exponential backoff plus jitter before the next attempt."""
    backoff_time = (2 ** attempt) + random.uniform(0, 1)
    print(f"Retrying in {backoff_time:.2f} seconds...")
    time.sleep(backoff_time)


def extract_methods_frameworks_with_ollama(job_description, max_retries=3, timeout=60):
    """
    Extract methods and frameworks from a job description using Ollama (Gemma).

    Args:
        job_description: Free text of the job ad. Missing values (None, NaN),
            non-string values and blank strings yield an empty list without
            calling the API.
        max_retries: Maximum number of API attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of method/framework names that the model reported AND that were
        verified to occur in the text (directly or via a known synonym).
        Returns an empty list on any failure; this function never raises, so
        callers can fall back to another extractor.
    """
    # Handle missing, NaN, non-text or blank values
    if not isinstance(job_description, str) or not job_description.strip():
        return []

    prompt = f"""
    You are a specialized methods and frameworks extractor for job descriptions. Your task is to identify ONLY project management methodologies, development frameworks, and working approaches mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY methods, frameworks, and methodologies 
    2. Include ONLY: project methodologies, development frameworks, process frameworks, working approaches
    3. Examples include: Scrum, Agile, Kanban, Waterfall, Lean, Six Sigma, DevOps, SAFe, PRINCE2, ITIL, XP (Extreme Programming)
    4. Do NOT include: programming languages, job titles, soft skills, or technical tools (unless they are actual methodologies)
    5. IMPORTANT: ONLY extract methods/frameworks that are EXPLICITLY mentioned in the text
    6. If no methods or frameworks are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Scrum, Agile, Kanban, Waterfall, Lean, Design Thinking, OKRs, PRINCE2
    - DO NOT INCLUDE: Python, Excel, leadership, communication, teamwork, SQL

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of methods and frameworks found.
    Example response format: ["Agile", "Scrum", "Kanban"]
    """

    # Make request to local Ollama API with retries
    for attempt in range(max_retries):
        can_retry = attempt < max_retries - 1
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={
                    "model": "gemma3:1b",
                    "prompt": prompt,
                    "stream": False
                },
                timeout=timeout
            )

            if response.status_code != 200:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server errors are worth retrying; client errors will repeat.
                if response.status_code >= 500 and can_retry:
                    _backoff_before_retry(attempt)
                    continue
                return []

            payload = response.json()
            raw_response = payload.get("response") if isinstance(payload, dict) else None
            if isinstance(raw_response, str):
                candidates = _parse_llm_methods(raw_response)
            else:
                print(f"Unexpected response payload: {payload}")
                candidates = None

            if candidates is None:
                # The model did not answer with a usable JSON array; ask again.
                if can_retry:
                    _backoff_before_retry(attempt)
                    continue
                return []

            # CRITICAL: keep only items that actually appear in the job description
            verified_methods = _verify_llm_methods(candidates, job_description)
            print(f"Initial extraction: {candidates}")
            print(f"After verification: {verified_methods}")
            return verified_methods

        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if not can_retry:
                print(f"Max retries exceeded. Moving on without methods/frameworks extraction.")
                return []
            _backoff_before_retry(attempt)
        except Exception as e:
            # Top-level boundary: extraction is best-effort and must not abort the batch.
            print(f"Unexpected exception: {str(e)}")
            return []

    return []

# Terms that are ordinary English words when lower-cased; they only count as
# a methodology when written with their canonical capitalisation.
_FALLBACK_CASE_SENSITIVE_TERMS = {"safe": "SAFe", "less": "LeSS"}

# Keywords searched for, in the order they are reported.
_FALLBACK_COMMON_METHODS = (
    "Agile", "Scrum", "Kanban", "Waterfall", "Lean", "Six Sigma", "DevOps",
    "SAFe", "PRINCE2", "ITIL", "XP", "Extreme Programming", "Design Thinking",
    "OKRs", "Objectives and Key Results", "TDD", "BDD", "Test Driven Development",
    "Behavior Driven Development", "FDD", "Feature Driven Development", "RUP",
    "Rational Unified Process", "Crystal", "DSDM", "Dynamic Systems Development Method",
    "LeSS", "Large-Scale Scrum", "DAD", "Disciplined Agile Delivery", "AUP",
    "Agile Unified Process", "PMBOK", "Project Management Body of Knowledge",
    "RAD", "Rapid Application Development", "MSF", "Microsoft Solutions Framework",
)

# Canonical display name -> lower-case variations and synonyms.
_FALLBACK_METHOD_SYNONYMS = {
    "Agile": ("agile methodology", "agile environment", "agile principles", "agile practices"),
    "Scrum": ("scrum master", "scrum team", "scrum methodology", "scrum framework"),
    "Kanban": ("kanban board", "kanban method", "kanban methodology"),
    "Waterfall": ("waterfall model", "waterfall methodology", "waterfall approach"),
    "Lean": ("lean methodology", "lean principles", "lean thinking", "lean management"),
    "Six Sigma": ("6 sigma", "six-sigma", "6-sigma", "lean six sigma"),
    "DevOps": ("dev ops", "dev-ops", "devops practices", "devops culture"),
    "SAFe": ("scaled agile framework", "safe framework", "safe methodology"),
    "PRINCE2": ("prince 2", "prince-2", "projects in controlled environments"),
    "ITIL": ("information technology infrastructure library", "itil framework", "itil processes"),
    "Extreme Programming": ("xp", "xp methodology", "extreme programming practices"),
    "Design Thinking": ("design-thinking", "design thinking methodology"),
    "OKRs": ("objectives and key results", "okr framework", "okr methodology"),
}


def _fallback_term_in_text(term, text):
    """Return True when *term* occurs in *text* as a whole word or phrase.

    A plain substring test would "find" XP inside "experience", RAD inside
    "graduate" or LeSS inside "unless", so the term must not be flanked by
    word characters.
    """
    exact_spelling = _FALLBACK_CASE_SENSITIVE_TERMS.get(term.lower())
    if exact_spelling is not None:
        needle, flags = exact_spelling, 0
    else:
        needle, flags = term, re.IGNORECASE
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, text, flags) is not None


def fallback_extract_methods_frameworks(job_text):
    """
    Simple keyword-based methods and frameworks extraction as fallback when Ollama fails.

    Args:
        job_text: Free text of the job ad. Missing values (None, NaN),
            non-string values and blank strings yield an empty list.

    Returns:
        A list of the known method/framework names found in the text, without
        duplicates, in a stable order (the order of the keyword table). A
        keyword matched only through a synonym is reported under its
        canonical name, e.g. "dev-ops" -> "DevOps".
    """
    if not isinstance(job_text, str) or not job_text.strip():
        return []

    found_methods = []
    for method in _FALLBACK_COMMON_METHODS:
        # Direct match
        if _fallback_term_in_text(method, job_text):
            found_methods.append(method)
            continue

        # Check synonyms
        method_lower = method.lower()
        for display_name, synonyms in _FALLBACK_METHOD_SYNONYMS.items():
            main_method = display_name.lower()
            if method_lower != main_method and method_lower not in synonyms:
                continue
            if any(_fallback_term_in_text(variant, job_text)
                   for variant in (main_method, *synonyms)):
                found_methods.append(display_name)
                break

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # saved CSV is identical from run to run (a set would shuffle it).
    return list(dict.fromkeys(found_methods))

def _combine_job_text(row, columns):
    """Join the non-missing values of *columns* in *row* into one string."""
    parts = []
    for column in columns:
        value = row.get(column)
        # str(NaN) is the literal text "nan"; skip missing cells instead of
        # feeding that to the extractors.
        if value is None or pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return ' '.join(parts)


def _load_checkpoint_progress(checkpoint_file, total_jobs, result_column,
                              fallback_column, processed_column):
    """Return (start_idx, results, fallback_flags) restored from the checkpoint.

    Rows before start_idx are already processed; results and fallback_flags
    are full-length lists pre-filled with the restored values.
    """
    results = [''] * total_jobs
    fallback_flags = [False] * total_jobs

    if not os.path.exists(checkpoint_file):
        return 0, results, fallback_flags

    checkpoint_df = pd.read_csv(checkpoint_file)
    if result_column not in checkpoint_df.columns or len(checkpoint_df) != total_jobs:
        print("Checkpoint does not match the dataset; starting from the beginning")
        return 0, results, fallback_flags

    saved_results = checkpoint_df[result_column].tolist()
    if processed_column in checkpoint_df.columns:
        done_flags = checkpoint_df[processed_column].tolist()
        start_idx = next(
            (i for i, done in enumerate(done_flags) if pd.isna(done) or not done),
            total_jobs,
        )
    else:
        # Older checkpoint without the processed marker. An empty result is
        # read back as NaN, so counting non-null cells under-counts; resume
        # after the last row that has a value (re-processing is harmless).
        last_valid = checkpoint_df[result_column].last_valid_index()
        start_idx = 0 if last_valid is None else int(last_valid) + 1

    if fallback_column in checkpoint_df.columns:
        saved_flags = checkpoint_df[fallback_column].tolist()
    else:
        saved_flags = [False] * total_jobs

    for i in range(start_idx):
        if pd.notna(saved_results[i]):
            results[i] = str(saved_results[i])
        if pd.notna(saved_flags[i]):
            fallback_flags[i] = bool(saved_flags[i])

    print(f"Resuming from checkpoint at index {start_idx}")
    return start_idx, results, fallback_flags


def main():
    """
    Extract methods/frameworks for every job in seek_jobs.csv and save the result.

    Reads the dataset, runs the Ollama extractor on each row (using the
    keyword-based extractor when it returns nothing), writes a checkpoint
    every 50 rows so an interrupted run can resume, and finally saves the
    enriched dataset plus summary statistics. Per-job details go to a log
    file; progress and errors are printed to the console. Any exception is
    reported with a traceback rather than propagated.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_methods_frameworks.csv"
    log_path = "frameworks_extraction_log.txt"
    checkpoint_file = "frameworks_extraction_checkpoint.csv"
    checkpoint_interval = 50

    title_column = "job_title"
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"
    text_columns = (job_type_column, title_column, job_details_column, description_column)

    result_column = 'extracted_methods_frameworks_str'
    fallback_column = 'used_fallback'
    # Checkpoint-only column: an empty result and "not processed yet" look
    # the same once written to CSV, so progress is recorded explicitly.
    processed_column = 'methods_frameworks_processed'

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting methods and frameworks extraction...\n")

        start_idx, extracted_methods_frameworks_str, used_fallback = _load_checkpoint_progress(
            checkpoint_file, total_jobs, result_column, fallback_column, processed_column
        )
        processed = [True] * start_idx + [False] * (total_jobs - start_idx)

        # Per-job output goes to the log file; utf-8 so job titles with
        # non-ASCII characters cannot break logging on any platform. The
        # context managers restore stdout and close the file even on error.
        with open(log_path, 'w', encoding='utf-8') as log_file, \
                contextlib.redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                combined_text = _combine_job_text(row, text_columns)

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    methods_frameworks = extract_methods_frameworks_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not methods_frameworks:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        methods_frameworks = fallback_extract_methods_frameworks(combined_text)
                        used_fallback[idx] = True

                    extracted_methods_frameworks_str[idx] = ', '.join(methods_frameworks)
                else:
                    print("Combined text is empty for this job")
                    extracted_methods_frameworks_str[idx] = ''

                processed[idx] = True
                print("-" * 50)

                is_last_job = (idx + 1) == total_jobs

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or is_last_job:
                    print(f"Creating checkpoint at index {idx}")
                    temp_df = df.copy()
                    temp_df[result_column] = extracted_methods_frameworks_str
                    temp_df[fallback_column] = used_fallback
                    temp_df[processed_column] = processed
                    # Write to a temporary file and swap it in, so a crash
                    # mid-write cannot corrupt the previous checkpoint.
                    temp_checkpoint = checkpoint_file + ".tmp"
                    temp_df.to_csv(temp_checkpoint, index=False)
                    os.replace(temp_checkpoint, checkpoint_file)

                # Avoid overwhelming the local Ollama service
                if not is_last_job:
                    time.sleep(random.uniform(0.5, 1.5))

        df[result_column] = extracted_methods_frameworks_str
        df[fallback_column] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [column for column in (title_column, result_column, fallback_column)
                          if column in df.columns]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_methods = sum(1 for methods in extracted_methods_frameworks_str if methods)
        jobs_using_fallback = sum(used_fallback)
        # Guard against an empty dataset (division by zero).
        methods_pct = jobs_with_methods / total_jobs * 100 if total_jobs else 0.0
        fallback_pct = jobs_using_fallback / total_jobs * 100 if total_jobs else 0.0

        print(f"\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted methods/frameworks: {jobs_with_methods} ({methods_pct:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({fallback_pct:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary of the script: report and exit cleanly.
        print(f"Script error: {str(e)}")
        traceback.print_exc()

# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

1import json 2import pandas as pd 3import requests 4import time 5import random 6from requests.exceptions import Timeout, ConnectionError 7 8def extract_methods_frameworks_with_ollama(job_description, max_retries=3, timeout=60): 9 """ 10 Extract methods and frameworks from job description using Ollama with Gemma model 11 with improved error handling and retries 12 """ 13 # Handle missing or NaN values 14 if pd.isna(job_description) or job_description == "": 15 return [] 16 17 job_description_lower = job_description.lower() 18 19 prompt = f""" 20 You are a specialized methods and frameworks extractor for job descriptions. Your task is to identify ONLY project management methodologies, development frameworks, and working approaches mentioned in the job description. 21 22 STRICT RULES: 23 1. Extract ONLY methods, frameworks, and methodologies 24 2. Include ONLY: project methodologies, development frameworks, process frameworks, working approaches 25 3. Examples include: Scrum, Agile, Kanban, Waterfall, Lean, Six Sigma, DevOps, SAFe, PRINCE2, ITIL, XP (Extreme Programming) 26 4. Do NOT include: programming languages, job titles, soft skills, or technical tools (unless they are actual methodologies) 27 5. IMPORTANT: ONLY extract methods/frameworks that are EXPLICITLY mentioned in the text 28 6. If no methods or frameworks are mentioned, return an empty array [] 29 30 CORRECT EXAMPLES: 31 - INCLUDE: Scrum, Agile, Kanban, Waterfall, Lean, Design Thinking, OKRs, PRINCE2 32 - DO NOT INCLUDE: Python, Excel, leadership, communication, teamwork, SQL 33 34 Job Description: 35 {job_description} 36 37 Respond ONLY with a JSON array containing the list of methods and frameworks found. 
38 Example response format: ["Agile", "Scrum", "Kanban"] 39 """ 40 41 # Make request to local Ollama API with retries 42 for attempt in range(max_retries): 43 try: 44 print(f"API call attempt {attempt + 1}/{max_retries}") 45 response = requests.post( 46 "http://localhost:11434/api/generate", 47 json={ 48 "model": "gemma3:1b", 49 "prompt": prompt, 50 "stream": False 51 }, 52 timeout=timeout # Increased timeout 53 ) 54 55 if response.status_code == 200: 56 result = response.json()["response"] 57 58 # Extract JSON array from the response 59 try: 60 # Try to find JSON array in the response 61 start_idx = result.find('[') 62 end_idx = result.rfind(']') + 1 63 64 if start_idx >= 0 and end_idx > start_idx: 65 json_str = result[start_idx:end_idx] 66 methods_frameworks = json.loads(json_str) 67 68 # Additional filtering to remove non-methodology items 69 non_methods = ["python", "java", "excel", "sql", "leadership", "communication", 70 "teamwork", "degree", "bachelor", "master", "certification"] 71 72 # Filter out any items that are likely not methods/frameworks 73 filtered_methods = [method for method in methods_frameworks 74 if method.lower() not in [nm.lower() for nm in non_methods] 75 and len(method) > 1] # Avoid single characters 76 77 # CRITICAL: Verify each method/framework actually appears in the job description 78 verified_methods = [] 79 80 # Common variations and synonyms for methods and frameworks 81 method_synonyms = { 82 "agile": ["agile methodology", "agile environment", "agile principles", "agile practices"], 83 "scrum": ["scrum master", "scrum team", "scrum methodology", "scrum framework"], 84 "kanban": ["kanban board", "kanban method", "kanban methodology"], 85 "waterfall": ["waterfall model", "waterfall methodology", "waterfall approach"], 86 "lean": ["lean methodology", "lean principles", "lean thinking", "lean management"], 87 "six sigma": ["6 sigma", "six-sigma", "6-sigma", "lean six sigma"], 88 "devops": ["dev ops", "dev-ops", "devops practices", 
"devops culture"], 89 "safe": ["scaled agile framework", "safe framework", "safe methodology"], 90 "prince2": ["prince 2", "prince-2", "projects in controlled environments"], 91 "itil": ["information technology infrastructure library", "itil framework", "itil processes"], 92 "extreme programming": ["xp", "xp methodology", "extreme programming practices"], 93 "design thinking": ["design-thinking", "design thinking methodology"], 94 "okrs": ["objectives and key results", "okr framework", "okr methodology"] 95 } 96 97 for method in filtered_methods: 98 method_lower = method.lower() 99 100 # Check if the method directly appears in the job description 101 if method_lower in job_description_lower: 102 verified_methods.append(method) 103 continue 104 105 # Check for synonyms and variations 106 for main_method, synonyms in method_synonyms.items(): 107 if method_lower == main_method or method_lower in synonyms: 108 # Check if any synonyms appear in the text 109 if any(syn in job_description_lower for syn in synonyms) or main_method in job_description_lower: 110 verified_methods.append(main_method.title()) # Use standardized version 111 break 112 113 print(f"Initial extraction: {methods_frameworks}") 114 print(f"After verification: {verified_methods}") 115 return verified_methods 116 else: 117 print(f"Couldn't find JSON array in: {result}") 118 if attempt < max_retries - 1: 119 # Add exponential backoff with jitter 120 backoff_time = (2 ** attempt) + random.uniform(0, 1) 121 print(f"Retrying in {backoff_time:.2f} seconds...") 122 time.sleep(backoff_time) 123 continue 124 return [] 125 except json.JSONDecodeError: 126 print(f"Failed to parse JSON from: {result}") 127 if attempt < max_retries - 1: 128 # Add exponential backoff with jitter 129 backoff_time = (2 ** attempt) + random.uniform(0, 1) 130 print(f"Retrying in {backoff_time:.2f} seconds...") 131 time.sleep(backoff_time) 132 continue 133 return [] 134 else: 135 print(f"Error from Ollama API: {response.status_code} - 
{response.text}") 136 # If it's a server error, retry 137 if response.status_code >= 500: 138 if attempt < max_retries - 1: 139 # Add exponential backoff with jitter 140 backoff_time = (2 ** attempt) + random.uniform(0, 1) 141 print(f"Retrying in {backoff_time:.2f} seconds...") 142 time.sleep(backoff_time) 143 continue 144 return [] 145 146 except (Timeout, ConnectionError) as e: 147 print(f"Connection error on attempt {attempt + 1}: {str(e)}") 148 if attempt < max_retries - 1: 149 # Add exponential backoff with jitter 150 backoff_time = (2 ** attempt) + random.uniform(0, 1) 151 print(f"Retrying in {backoff_time:.2f} seconds...") 152 time.sleep(backoff_time) 153 else: 154 print(f"Max retries exceeded. Moving on without methods/frameworks extraction.") 155 return [] 156 except Exception as e: 157 print(f"Unexpected exception: {str(e)}") 158 return [] 159 160 return [] 161 162def fallback_extract_methods_frameworks(job_text): 163 """ 164 Simple keyword-based methods and frameworks extraction as fallback when Ollama fails 165 """ 166 if pd.isna(job_text) or job_text == "": 167 return [] 168 169 common_methods = [ 170 "Agile", "Scrum", "Kanban", "Waterfall", "Lean", "Six Sigma", "DevOps", 171 "SAFe", "PRINCE2", "ITIL", "XP", "Extreme Programming", "Design Thinking", 172 "OKRs", "Objectives and Key Results", "TDD", "BDD", "Test Driven Development", 173 "Behavior Driven Development", "FDD", "Feature Driven Development", "RUP", 174 "Rational Unified Process", "Crystal", "DSDM", "Dynamic Systems Development Method", 175 "LeSS", "Large-Scale Scrum", "DAD", "Disciplined Agile Delivery", "AUP", 176 "Agile Unified Process", "PMBOK", "Project Management Body of Knowledge", 177 "RAD", "Rapid Application Development", "MSF", "Microsoft Solutions Framework" 178 ] 179 180 job_text_lower = job_text.lower() 181 found_methods = [] 182 183 # Common variations and synonyms for methods and frameworks 184 method_synonyms = { 185 "agile": ["agile methodology", "agile environment", "agile 
principles", "agile practices"], 186 "scrum": ["scrum master", "scrum team", "scrum methodology", "scrum framework"], 187 "kanban": ["kanban board", "kanban method", "kanban methodology"], 188 "waterfall": ["waterfall model", "waterfall methodology", "waterfall approach"], 189 "lean": ["lean methodology", "lean principles", "lean thinking", "lean management"], 190 "six sigma": ["6 sigma", "six-sigma", "6-sigma", "lean six sigma"], 191 "devops": ["dev ops", "dev-ops", "devops practices", "devops culture"], 192 "safe": ["scaled agile framework", "safe framework", "safe methodology"], 193 "prince2": ["prince 2", "prince-2", "projects in controlled environments"], 194 "itil": ["information technology infrastructure library", "itil framework", "itil processes"], 195 "extreme programming": ["xp", "xp methodology", "extreme programming practices"], 196 "design thinking": ["design-thinking", "design thinking methodology"], 197 "okrs": ["objectives and key results", "okr framework", "okr methodology"] 198 } 199 200 for method in common_methods: 201 method_lower = method.lower() 202 203 # Direct match 204 if method_lower in job_text_lower: 205 found_methods.append(method) 206 continue 207 208 # Check synonyms 209 for main_method, synonyms in method_synonyms.items(): 210 if method_lower == main_method or method_lower in synonyms: 211 if any(syn in job_text_lower for syn in synonyms) or main_method in job_text_lower: 212 found_methods.append(main_method.title()) 213 break 214 215 return list(set(found_methods)) # Remove duplicates 216 217def main(): 218 print("✅ Script started") 219 220 file_path = "seek_jobs.csv" 221 output_path = "seek_jobs_with_methods_frameworks.csv" 222 log_path = "frameworks_extraction_log.txt" 223 224 # Setup logging to file 225 import sys 226 original_stdout = sys.stdout 227 log_file = open(log_path, 'w') 228 229 try: 230 print(f"Reading dataset from {file_path}...") 231 df = pd.read_csv(file_path) 232 print(f"📄 Data loaded: {len(df)} rows") 233 234 
title_column = "job_title" 235 description_column = "description" 236 job_details_column = "job_details" 237 job_type_column = "job_type" 238 239 print(f"Total records: {len(df)}") 240 print("Starting methods and frameworks extraction...\n") 241 242 all_methods_frameworks = [] 243 extracted_methods_frameworks_str = [] 244 used_fallback = [] 245 246 processing_range = df 247 248 # Create a checkpoint system to save progress periodically 249 checkpoint_interval = 50 250 checkpoint_file = "frameworks_extraction_checkpoint.csv" 251 252 # Load checkpoint if exists 253 import os 254 start_idx = 0 255 if os.path.exists(checkpoint_file): 256 checkpoint_df = pd.read_csv(checkpoint_file) 257 if 'extracted_methods_frameworks_str' in checkpoint_df.columns: 258 extracted_so_far = checkpoint_df['extracted_methods_frameworks_str'].notna().sum() 259 start_idx = extracted_so_far 260 print(f"Resuming from checkpoint at index {start_idx}") 261 # Copy already processed rows 262 extracted_methods_frameworks_str = checkpoint_df['extracted_methods_frameworks_str'].tolist() 263 used_fallback = checkpoint_df['used_fallback'].tolist() if 'used_fallback' in checkpoint_df.columns else [False] * len(extracted_methods_frameworks_str) 264 else: 265 extracted_methods_frameworks_str = [None] * len(df) 266 used_fallback = [False] * len(df) 267 else: 268 extracted_methods_frameworks_str = [None] * len(df) 269 used_fallback = [False] * len(df) 270 271 sys.stdout = log_file 272 for idx in range(start_idx, len(processing_range)): 273 row = processing_range.iloc[idx] 274 print(f"\nJob {idx + 1}/{len(processing_range)}:") 275 276 job_title = row[title_column] if title_column in df.columns else "Not specified" 277 print(f"Job Title: {job_title}") 278 279 # Combine text from multiple columns 280 combined_text = ( 281 str(row.get(job_type_column, '')) + ' ' + 282 str(row.get(title_column, '')) + ' ' + 283 str(row.get(job_details_column, '')) + ' ' + 284 str(row.get(description_column, '')) 285 ).strip() 286 
287 if combined_text: 288 print("Processing combined text:") 289 # Try with Ollama first 290 methods_frameworks = extract_methods_frameworks_with_ollama(combined_text) 291 292 # If Ollama fails or returns empty, try fallback 293 if not methods_frameworks: 294 print("Ollama extraction failed or returned empty. Using fallback extraction.") 295 methods_frameworks = fallback_extract_methods_frameworks(combined_text) 296 used_fallback[idx] = True 297 298 all_methods_frameworks.append(methods_frameworks) 299 extracted_methods_frameworks_str[idx] = (', '.join(methods_frameworks) if methods_frameworks else '') 300 else: 301 print("Combined text is empty for this job") 302 all_methods_frameworks.append([]) 303 extracted_methods_frameworks_str[idx] = '' 304 305 print("-" * 50) 306 307 # Save checkpoint periodically 308 if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == len(processing_range): 309 print(f"Creating checkpoint at index {idx}") 310 temp_df = df.copy() 311 temp_df['extracted_methods_frameworks_str'] = extracted_methods_frameworks_str 312 temp_df['used_fallback'] = used_fallback 313 temp_df.to_csv(checkpoint_file, index=False) 314 315 # Avoid overwhelming the local Ollama service 316 time.sleep(random.uniform(0.5, 1.5)) 317 318 # Reset stdout and close log file 319 sys.stdout = original_stdout 320 log_file.close() 321 322 df['extracted_methods_frameworks_str'] = extracted_methods_frameworks_str 323 df['used_fallback'] = used_fallback 324 325 df.to_csv(output_path, index=False) 326 print(f"\nUpdated dataset saved to {output_path}") 327 328 print("\n--- Sample Results ---") 329 print(df[[title_column, 'extracted_methods_frameworks_str', 'used_fallback']].head(10)) 330 331 # Print summary statistics 332 total_jobs = len(df) 333 jobs_with_methods = sum(1 for methods in extracted_methods_frameworks_str if methods) 334 jobs_using_fallback = sum(used_fallback) 335 336 print(f"\n--- Summary Statistics ---") 337 print(f"Total jobs processed: {total_jobs}") 338 
print(f"Jobs with extracted methods/frameworks: {jobs_with_methods} ({jobs_with_methods/total_jobs*100:.1f}%)") 339 print(f"Jobs using fallback extraction: {jobs_using_fallback} ({jobs_using_fallback/total_jobs*100:.1f}%)") 340 print(f"\nDetailed logs saved to {log_path}") 341 342 except Exception as e: 343 sys.stdout = original_stdout 344 log_file.close() 345 print(f"Script error: {str(e)}") 346 import traceback 347 traceback.print_exc() 348 349if __name__ == "__main__": 350 main()

Methods and frameworks extraction and saving data into a CSV file (Source code)

Next Up

Lesson 33: Language extraction and saving data into csv file(Source code)

Next Up

Lesson 33: Language extraction and saving data into csv file(Source code)