AusBiz Consulting

✅ Compared with the previous lesson (no. 28), this script improves on the soft skill extraction approach by verifying the extracted results, saving them to seek_jobs_with_languages.csv, and handling AI/API failures gracefully.

✅ Extracts programming, scripting, and markup languages from job descriptions using Ollama’s Gemma model, with strict rule-based prompting.

✅ Implements robust error handling, retry logic, and exponential backoff for reliable API calls to the local Ollama service.

✅ Falls back to a keyword- and synonym-based method when AI extraction fails, ensuring no job is skipped.

✅ Verifies each extracted language against the original job description text to avoid false positives.

✅ Automatically saves progress to a checkpoint file after every 50 jobs to enable resumption and prevent data loss.

Python

import contextlib
import json
import os
import random
import re
import time
import traceback

import pandas as pd
import requests
from requests.exceptions import Timeout, ConnectionError

# Frameworks, tools, platforms and methodologies that the model sometimes
# returns even though the prompt forbids them.
_OLLAMA_NON_LANGUAGES = frozenset({
    "react", "angular", "node.js", "django", "flask", "spring", "express",
    "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes",
    "git", "jenkins", "jira", "kanban", "teamwork",
})

# Canonical display name -> lower-case spellings that also count as a mention.
_OLLAMA_LANGUAGE_ALIASES = {
    "JavaScript": ("js", "ecmascript"),
    "TypeScript": ("ts",),
    "Python": ("py", "python3", "python 3", "python2", "python 2"),
    "Java": ("jvm",),
    "C#": ("csharp", "c sharp"),
    "C++": ("cpp", "cplusplus", "c plus plus"),
    "C": ("c programming", "c language", "ansi c",
          "programming in c", "code in c", "c developer"),
    "Ruby": ("rb", "ruby on rails", "rails"),
    "PHP": (),
    "Swift": (),
    "Kotlin": ("kt",),
    "Go": ("golang", "go language"),
    "SQL": ("tsql", "plsql", "mysql", "postgresql", "oracle sql", "sql server"),
    "HTML": ("html5", "html 5"),
    "CSS": ("css3", "css 3", "scss", "sass"),
    "R": ("r programming", "r language"),
    "Scala": (),
    "Rust": ("rust lang",),
    "MATLAB": (),
    "Perl": (),
    "Bash": ("shell", "shell script", "shell scripting"),
    "PowerShell": ("power shell",),
    "VBA": ("visual basic", "visual basic for applications"),
    "Groovy": (),
    "Fortran": (),
}

# Lower-case spelling -> canonical name; the canonical name itself is listed
# last so it always wins over an identical alias.
_OLLAMA_ALIAS_INDEX = {
    spelling: canonical
    for canonical, aliases in _OLLAMA_LANGUAGE_ALIASES.items()
    for spelling in (*aliases, canonical.lower())
}

# Names that are also ordinary English tokens; they must appear with this
# exact capitalisation to count as a mention.
_OLLAMA_EXACT_CASE_NAMES = frozenset({"C", "R", "Go"})


def _ollama_text_mentions(term, text):
    """Return True when *term* occurs in *text* as a whole token."""
    if term in _OLLAMA_EXACT_CASE_NAMES:
        # Case-sensitive, and not glued to "+", "#", "&" or "-" so that
        # "C++", "C#", "R&D", "C-level" and "Objective-C" do not count.
        pattern = rf"(?<![A-Za-z0-9_+#&.-]){re.escape(term)}(?![A-Za-z0-9_+#&-])"
        return re.search(pattern, text) is not None
    # A leading "." is rejected so "Node.js" does not count as "js"; a
    # trailing "+"/"#" is rejected so "c" never matches inside "c++"/"c#".
    pattern = rf"(?<![A-Za-z0-9_.]){re.escape(term)}(?![A-Za-z0-9_+#])"
    return re.search(pattern, text, re.IGNORECASE) is not None


def _parse_ollama_language_reply(raw_reply):
    """Pull the JSON array of language names out of the model's reply.

    Returns the non-empty string items of the array, or ``None`` when the
    reply holds no parseable array (the caller treats that as retryable).
    """
    start = raw_reply.find("[")
    end = raw_reply.rfind("]") + 1
    if start < 0 or end <= start:
        print(f"Couldn't find JSON array in: {raw_reply}")
        return None
    try:
        parsed = json.loads(raw_reply[start:end])
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {raw_reply}")
        return None
    # The model occasionally emits numbers, nulls or nested arrays; skipping
    # them keeps one bad item from discarding the whole extraction.
    return [item.strip() for item in parsed if isinstance(item, str) and item.strip()]


def _verify_ollama_languages(candidates, job_description):
    """Keep only candidates that are really mentioned in *job_description*.

    Each candidate is mapped to its canonical name, checked against the text
    with whole-token matching (name plus known aliases), and returned at most
    once, in the order the model listed them.
    """
    verified = []
    seen = set()
    for candidate in candidates:
        key = candidate.lower()
        if key in _OLLAMA_NON_LANGUAGES:
            continue
        canonical = _OLLAMA_ALIAS_INDEX.get(key, candidate)
        # Single characters other than C and R are treated as noise.
        if len(canonical) == 1 and canonical not in ("C", "R"):
            continue
        if canonical.lower() in seen:
            continue
        terms = (canonical, *_OLLAMA_LANGUAGE_ALIASES.get(canonical, ()))
        if any(_ollama_text_mentions(term, job_description) for term in terms):
            verified.append(canonical)
            seen.add(canonical.lower())
    return verified


def extract_programming_languages_with_ollama(
    job_description,
    max_retries=3,
    timeout=60,
    *,
    model="gemma3:1b",
    api_url="http://localhost:11434/api/generate",
):
    """
    Extract programming languages from a job description using an Ollama model.

    The model's answer is parsed as a JSON array, filtered against a list of
    known non-languages, and every remaining name is verified to occur in the
    job description as a whole token (directly or through a known alias).

    Args:
        job_description: Text of the job ad. Anything that is not a non-blank
            string (None, NaN, numbers, "") yields an empty list.
        max_retries: Maximum number of API attempts.
        timeout: Per-request timeout in seconds.
        model: Ollama model name sent in the request.
        api_url: URL of the Ollama ``/api/generate`` endpoint.

    Returns:
        List of verified language names with canonical capitalisation and no
        duplicates. An empty list means either "no languages" or "extraction
        failed"; failures are reported on stdout, never raised.
    """
    if not isinstance(job_description, str) or not job_description.strip():
        return []

    prompt = f"""
    You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY programming, scripting, and markup languages
    2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc.
    3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum)
    4. Do NOT include job titles, responsibilities, or soft skills
    5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text
    6. If no programming languages are mentioned, return an empty array []
    7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript")

    CORRECT EXAMPLES:
    - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS"
    - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile"

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of programming languages found.
    Example response format: ["Python", "JavaScript", "SQL"]
    """

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            print(f"API call attempt {attempt + 1}/{max_retries}")
            response = requests.post(
                api_url,
                json={"model": model, "prompt": prompt, "stream": False},
                timeout=timeout,
            )

            if response.status_code == 200:
                candidates = _parse_ollama_language_reply(response.json()["response"])
                if candidates is not None:
                    verified_languages = _verify_ollama_languages(candidates, job_description)
                    print(f"Initial extraction: {candidates}")
                    print(f"After verification: {verified_languages}")
                    return verified_languages
                # Malformed reply: the model may do better on another try.
                should_retry = True
            else:
                print(f"Error from Ollama API: {response.status_code} - {response.text}")
                # Only server-side errors are worth retrying.
                should_retry = response.status_code >= 500
        except (Timeout, ConnectionError) as e:
            print(f"Connection error on attempt {attempt + 1}: {str(e)}")
            if is_last_attempt:
                print("Max retries exceeded. Moving on without languages extraction.")
                return []
            should_retry = True
        except Exception as e:
            # Deliberate best-effort boundary: one bad job must not stop the run.
            print(f"Unexpected exception: {str(e)}")
            return []

        if not should_retry or is_last_attempt:
            return []

        # Exponential backoff with jitter before the next attempt.
        backoff_time = (2 ** attempt) + random.uniform(0, 1)
        print(f"Retrying in {backoff_time:.2f} seconds...")
        time.sleep(backoff_time)

    return []
208
# Languages the fallback looks for, in the order they are reported.
_FALLBACK_LANGUAGES = (
    "Python", "Java", "JavaScript", "TypeScript", "C#", "C++", "C", "Ruby", "PHP", "Go", "Rust",
    "SQL", "HTML", "CSS", "R", "Scala", "Swift", "Kotlin", "MATLAB", "Perl", "Bash",
    "PowerShell", "VBA", "Groovy", "Fortran", "Lua", "Haskell", "Clojure", "Erlang", "F#",
    "COBOL", "Assembly", "Dart", "Objective-C", "Julia", "Lisp", "Scheme", "Prolog",
)

# Alternative spellings that count as a mention of the keyed language.
_FALLBACK_SYNONYMS = {
    "JavaScript": ("js", "ecmascript"),
    "TypeScript": ("ts",),
    "Python": ("py", "python3", "python2"),
    "C#": ("csharp", "c sharp"),
    "C++": ("cpp", "cplusplus", "c plus plus"),
    "C": ("c programming", "c language", "ansi c",
          "programming in c", "code in c", "c developer"),
    "Ruby": ("rb",),
    "Go": ("golang",),
    "SQL": ("mysql", "postgresql", "tsql", "plsql", "oracle sql", "sql server"),
    "HTML": ("html5",),
    "CSS": ("css3", "scss", "sass"),
    "R": ("r programming", "r language"),
    "Bash": ("shell", "shell script", "shell scripting"),
    "PowerShell": ("power shell",),
    "VBA": ("visual basic", "visual basic for applications"),
}

# Names that are also ordinary English tokens; they must appear with this
# exact capitalisation to count as a mention.
_FALLBACK_EXACT_CASE_NAMES = frozenset({"C", "R", "Go"})


def _fallback_text_mentions(term, text):
    """Return True when *term* occurs in *text* as a whole token."""
    if term in _FALLBACK_EXACT_CASE_NAMES:
        # Case-sensitive, and not glued to "+", "#", "&" or "-" so that
        # "C++", "C#", "R&D", "C-level" and "Objective-C" do not count.
        pattern = rf"(?<![A-Za-z0-9_+#&.-]){re.escape(term)}(?![A-Za-z0-9_+#&-])"
        return re.search(pattern, text) is not None
    # A leading "." is rejected so "Node.js" does not count as "js"; a
    # trailing "+"/"#" is rejected so "c" never matches inside "c++"/"c#".
    pattern = rf"(?<![A-Za-z0-9_.]){re.escape(term)}(?![A-Za-z0-9_+#])"
    return re.search(pattern, text, re.IGNORECASE) is not None


def fallback_extract_languages(job_text):
    """
    Keyword-based programming language extraction, used when Ollama fails.

    A language is reported when its name or one of its synonyms occurs in the
    text as a whole token. Plain substring tests are not used because they
    report "Java" for "JavaScript", "Go" for "good", and "C"/"R" for almost
    any text.

    Args:
        job_text: Text of the job ad. Anything that is not a non-blank string
            (None, NaN, numbers, "") yields an empty list.

    Returns:
        List of language names, without duplicates, in the order of
        ``_FALLBACK_LANGUAGES``.
    """
    if not isinstance(job_text, str) or not job_text.strip():
        return []

    found_languages = []
    for language in _FALLBACK_LANGUAGES:
        terms = (language, *_FALLBACK_SYNONYMS.get(language, ()))
        if any(_fallback_text_mentions(term, job_text) for term in terms):
            found_languages.append(language)
    return found_languages
264
def main():
    """
    Run language extraction over seek_jobs.csv and write the enriched CSV.

    For every job the text columns are combined and sent to
    extract_programming_languages_with_ollama(); when that yields nothing,
    fallback_extract_languages() is used instead. Per-job output goes to a
    log file, progress is checkpointed every 50 jobs, and a summary is
    printed to the console at the end.

    Any error is reported on the console with a traceback rather than raised.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_languages.csv"
    log_path = "language_extraction_log.txt"
    checkpoint_file = "language_extraction_checkpoint.csv"
    checkpoint_interval = 50

    # Columns to extract programming languages from
    title_column = "job_title"
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"
    text_columns = (job_type_column, title_column, job_details_column, description_column)

    result_column = "extracted_programming_languages_str"
    fallback_column = "used_fallback"
    # Explicit "row was processed" marker stored in the checkpoint. It is
    # needed because an empty result is written as a blank CSV cell and read
    # back as NaN, which is indistinguishable from "not processed yet".
    done_column = "language_extraction_done"

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting programming languages extraction...\n")

        extracted_languages_str = [None] * total_jobs
        used_fallback = [False] * total_jobs
        start_idx = 0

        # Load checkpoint if it exists and belongs to this dataset
        if os.path.exists(checkpoint_file):
            checkpoint_df = pd.read_csv(checkpoint_file)
            if result_column in checkpoint_df.columns and len(checkpoint_df) == total_jobs:
                saved_results = checkpoint_df[result_column]
                if done_column in checkpoint_df.columns:
                    done_flags = [
                        bool(flag) if pd.notna(flag) else False
                        for flag in checkpoint_df[done_column]
                    ]
                    start_idx = done_flags.index(False) if False in done_flags else total_jobs
                else:
                    # Checkpoint written without the marker column: resume
                    # after the last row that has a non-empty result.
                    last_valid = saved_results.last_valid_index()
                    start_idx = 0 if last_valid is None else int(last_valid) + 1

                for i in range(start_idx):
                    value = saved_results.iloc[i]
                    # Blank cells come back as NaN; restore them as "".
                    extracted_languages_str[i] = "" if pd.isna(value) else str(value)
                if fallback_column in checkpoint_df.columns:
                    saved_fallback = checkpoint_df[fallback_column]
                    for i in range(start_idx):
                        flag = saved_fallback.iloc[i]
                        used_fallback[i] = bool(flag) if pd.notna(flag) else False
                print(f"Resuming from checkpoint at index {start_idx}")
            else:
                print("Checkpoint does not match the dataset; starting from the beginning")

        # Per-job output goes to the log file. The context managers restore
        # stdout and close the file even on errors or Ctrl-C. On resume the
        # log is appended to instead of truncated.
        log_mode = "a" if start_idx else "w"
        with open(log_path, log_mode, encoding="utf-8") as log_file, \
                contextlib.redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                # Combine text from multiple columns, skipping missing cells
                # so the literal string "nan" is never sent to the model.
                parts = []
                for column in text_columns:
                    value = row.get(column, "")
                    if pd.notna(value):
                        parts.append(str(value))
                combined_text = " ".join(parts).strip()

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    languages = extract_programming_languages_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not languages:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        languages = fallback_extract_languages(combined_text)
                        used_fallback[idx] = True

                    extracted_languages_str[idx] = ", ".join(languages)
                else:
                    print("Combined text is empty for this job")
                    extracted_languages_str[idx] = ""

                print("-" * 50)

                # Save checkpoint periodically
                processed = idx + 1
                if processed % checkpoint_interval == 0 or processed == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    temp_df = df.copy()
                    temp_df[result_column] = extracted_languages_str
                    temp_df[fallback_column] = used_fallback
                    temp_df[done_column] = [i < processed for i in range(total_jobs)]
                    temp_df.to_csv(checkpoint_file, index=False)

                # Avoid overwhelming the local Ollama service
                time.sleep(random.uniform(0.5, 1.5))

        df[result_column] = extracted_languages_str
        df[fallback_column] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [
            column
            for column in (title_column, result_column, fallback_column)
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_languages = sum(1 for langs in extracted_languages_str if langs)
        jobs_using_fallback = sum(used_fallback)
        # Guard the percentages against an empty dataset.
        with_languages_pct = jobs_with_languages / total_jobs * 100 if total_jobs else 0.0
        fallback_pct = jobs_using_fallback / total_jobs * 100 if total_jobs else 0.0

        print("\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted languages: {jobs_with_languages} ({with_languages_pct:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({fallback_pct:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary: report the failure on the console and exit cleanly.
        print(f"Script error: {str(e)}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

1import json 2import pandas as pd 3import requests 4import time 5import random 6from requests.exceptions import Timeout, ConnectionError 7 8def extract_programming_languages_with_ollama(job_description, max_retries=3, timeout=60): 9 """ 10 Extract programming languages from job description using Ollama with Gemma model 11 with improved error handling and retries 12 """ 13 # Handle missing or NaN values 14 if pd.isna(job_description) or job_description == "": 15 return [] 16 17 job_description_lower = job_description.lower() 18 19 prompt = f""" 20 You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description. 21 22 STRICT RULES: 23 1. Extract ONLY programming, scripting, and markup languages 24 2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc. 25 3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum) 26 4. Do NOT include job titles, responsibilities, or soft skills 27 5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text 28 6. If no programming languages are mentioned, return an empty array [] 29 7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript") 30 31 CORRECT EXAMPLES: 32 - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS" 33 - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile" 34 35 Job Description: 36 {job_description} 37 38 Respond ONLY with a JSON array containing the list of programming languages found. 
39 Example response format: ["Python", "JavaScript", "SQL"] 40 """ 41 42 # Make request to local Ollama API with retries and backoff 43 for attempt in range(max_retries): 44 try: 45 print(f"API call attempt {attempt + 1}/{max_retries}") 46 response = requests.post( 47 "http://localhost:11434/api/generate", 48 json={ 49 "model": "gemma3:1b", 50 "prompt": prompt, 51 "stream": False 52 }, 53 timeout=timeout # Increased timeout 54 ) 55 56 if response.status_code == 200: 57 result = response.json()["response"] 58 59 # Extract JSON array from the response 60 try: 61 # Try to find JSON array in the response 62 start_idx = result.find('[') 63 end_idx = result.rfind(']') + 1 64 65 if start_idx >= 0 and end_idx > start_idx: 66 json_str = result[start_idx:end_idx] 67 languages = json.loads(json_str) 68 69 # Additional filtering to remove non-language items 70 non_languages = ["react", "angular", "node.js", "django", "flask", "spring", "express", 71 "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes", 72 "git", "jenkins", "jira", "agile", "scrum", "kanban", "teamwork"] 73 74 # Filter out any items that are likely not programming languages 75 filtered_languages = [lang for lang in languages 76 if lang.lower() not in [nl.lower() for nl in non_languages] 77 and len(lang) > 1] # Avoid single characters except for specific cases like R 78 79 # Special case for single character languages like R 80 if "R" in languages and "R" not in filtered_languages: 81 filtered_languages.append("R") 82 83 # CRITICAL: Verify each language actually appears in the job description 84 verified_languages = [] 85 86 # Common variations and synonyms for programming languages 87 language_synonyms = { 88 "javascript": ["js", "javascript", "ecmascript"], 89 "typescript": ["ts", "typescript"], 90 "python": ["py", "python3", "python 3", "python2", "python 2"], 91 "java": ["java", "jvm"], 92 "c#": ["c#", "csharp", "c sharp"], 93 "c++": ["c++", "cpp", "cplusplus", "c plus plus"], 94 "ruby": ["rb", 
"ruby on rails", "rails"], 95 "php": ["php"], 96 "swift": ["swift"], 97 "kotlin": ["kt", "kotlin"], 98 "go": ["golang", "go language"], 99 "sql": ["sql", "tsql", "plsql", "mysql", "postgresql", "oracle sql", "sql server"], 100 "html": ["html", "html5", "html 5"], 101 "css": ["css", "css3", "css 3", "scss", "sass"], 102 "r": ["r programming", "r language"], 103 "scala": ["scala"], 104 "rust": ["rust", "rust lang"], 105 "matlab": ["matlab"], 106 "perl": ["perl"], 107 "bash": ["bash", "shell", "shell script", "shell scripting"], 108 "powershell": ["powershell", "power shell"], 109 "vba": ["visual basic", "visual basic for applications"], 110 "groovy": ["groovy"], 111 "fortran": ["fortran"] 112 } 113 114 for language in filtered_languages: 115 language_lower = language.lower() 116 117 # Check if the language directly appears in the job description 118 if language_lower in job_description_lower: 119 verified_languages.append(language) 120 continue 121 122 # Check for synonyms and variations 123 for main_lang, synonyms in language_synonyms.items(): 124 if language_lower == main_lang or language_lower in synonyms: 125 # Check if any synonyms appear in the text 126 if any(syn in job_description_lower for syn in synonyms) or main_lang in job_description_lower: 127 # Standardize the language name with proper capitalization 128 if main_lang == "javascript": 129 verified_languages.append("JavaScript") 130 elif main_lang == "typescript": 131 verified_languages.append("TypeScript") 132 elif main_lang == "c#": 133 verified_languages.append("C#") 134 elif main_lang == "c++": 135 verified_languages.append("C++") 136 elif main_lang == "php": 137 verified_languages.append("PHP") 138 elif main_lang == "sql": 139 verified_languages.append("SQL") 140 elif main_lang == "html": 141 verified_languages.append("HTML") 142 elif main_lang == "css": 143 verified_languages.append("CSS") 144 elif main_lang == "r": 145 verified_languages.append("R") 146 elif main_lang == "vba": 147 
verified_languages.append("VBA") 148 else: 149 verified_languages.append(main_lang.title()) # Use standardized version 150 break 151 152 # Handle special case for "C" language to avoid false positives 153 if "C" in languages and not any(lang in verified_languages for lang in ["C++", "C#"]): 154 # Verify it's actually the C language with context patterns 155 c_language_patterns = ["c programming", "c language", "ansi c", 156 "programming in c", "code in c", "c developer"] 157 if any(pattern in job_description_lower for pattern in c_language_patterns): 158 verified_languages.append("C") 159 160 print(f"Initial extraction: {languages}") 161 print(f"After verification: {verified_languages}") 162 return verified_languages 163 else: 164 print(f"Couldn't find JSON array in: {result}") 165 if attempt < max_retries - 1: 166 # Add exponential backoff with jitter 167 backoff_time = (2 ** attempt) + random.uniform(0, 1) 168 print(f"Retrying in {backoff_time:.2f} seconds...") 169 time.sleep(backoff_time) 170 continue 171 return [] 172 except json.JSONDecodeError: 173 print(f"Failed to parse JSON from: {result}") 174 if attempt < max_retries - 1: 175 # Add exponential backoff with jitter 176 backoff_time = (2 ** attempt) + random.uniform(0, 1) 177 print(f"Retrying in {backoff_time:.2f} seconds...") 178 time.sleep(backoff_time) 179 continue 180 return [] 181 else: 182 print(f"Error from Ollama API: {response.status_code} - {response.text}") 183 # If it's a server error, retry 184 if response.status_code >= 500: 185 if attempt < max_retries - 1: 186 # Add exponential backoff with jitter 187 backoff_time = (2 ** attempt) + random.uniform(0, 1) 188 print(f"Retrying in {backoff_time:.2f} seconds...") 189 time.sleep(backoff_time) 190 continue 191 return [] 192 193 except (Timeout, ConnectionError) as e: 194 print(f"Connection error on attempt {attempt + 1}: {str(e)}") 195 if attempt < max_retries - 1: 196 # Add exponential backoff with jitter 197 backoff_time = (2 ** attempt) + 
# NOTE(review): the commented lines below are the tail of
# extract_programming_languages_with_ollama, whose beginning lies before this
# part of the listing. They are preserved verbatim (including the listing's
# own line numbers) and are intentionally left unmodified.
#     random.uniform(0, 1)
# 198 print(f"Retrying in {backoff_time:.2f} seconds...")
# 199 time.sleep(backoff_time)
# 200 else:
# 201 print(f"Max retries exceeded. Moving on without languages extraction.")
# 202 return []
# 203 except Exception as e:
# 204 print(f"Unexpected exception: {str(e)}")
# 205 return []
# 206
# 207 return []


def fallback_extract_languages(job_text):
    """
    Keyword-based programming language extraction used when Ollama fails.

    Each language name (and each of its synonyms) is searched for as a whole
    token rather than as a raw substring, so "Java" is not reported for
    "JavaScript", "Go" is not reported for "Google", and "C" is not reported
    for "C++", "C#" or "Objective-C".

    Args:
        job_text: Free text of the job ad. Missing values (None/NaN) and the
            empty string yield an empty result; other values are converted
            with ``str``.

    Returns:
        List of canonical language names found, in the order they appear in
        ``common_languages`` (not in order of appearance in the text).
    """
    import re  # local import, in the same style main() uses for its helpers

    if pd.isna(job_text) or job_text == "":
        return []
    text = str(job_text)

    common_languages = [
        "Python", "Java", "JavaScript", "TypeScript", "C#", "C++", "C", "Ruby", "PHP", "Go", "Rust",
        "SQL", "HTML", "CSS", "R", "Scala", "Swift", "Kotlin", "MATLAB", "Perl", "Bash",
        "PowerShell", "VBA", "Groovy", "Fortran", "Lua", "Haskell", "Clojure", "Erlang", "F#",
        "COBOL", "Assembly", "Dart", "Objective-C", "Julia", "Lisp", "Scheme", "Prolog",
    ]

    language_synonyms = {
        "JavaScript": ["js", "ecmascript"],
        "TypeScript": ["ts"],
        "Python": ["py", "python3", "python2"],
        "C#": ["csharp", "c sharp"],
        "C++": ["cpp", "cplusplus", "c plus plus"],
        # Context phrases that identify the C language even when it is
        # written in lower case.
        "C": ["c programming", "c language", "ansi c",
              "programming in c", "code in c", "c developer"],
        "Ruby": ["rb"],
        "Go": ["golang"],
        "SQL": ["mysql", "postgresql", "tsql", "plsql", "oracle sql", "sql server"],
        "HTML": ["html5"],
        "CSS": ["css3", "scss", "sass"],
        "R": ["r programming", "r language"],
        "Bash": ["shell", "shell script", "shell scripting"],
        "PowerShell": ["power shell"],
        "VBA": ["visual basic", "visual basic for applications"],
    }

    # Very short names collide with ordinary words/letters when case is
    # ignored ("go to market", list markers like "c)"), so they must appear
    # with their canonical capitalisation.
    case_sensitive_names = {"C", "R", "Go"}

    def is_mentioned(term, ignore_case=True):
        """Return True if *term* occurs in the text as a standalone token."""
        words = term.split()
        # A single-letter first/last word gets a stricter boundary so that
        # "Objective-C", "R&D", "C-suite", "C++" and "C#" do not count as a
        # mention of the bare letter.
        if len(words[0]) == 1:
            before = r"(?<![A-Za-z0-9_.&'’-])"
        else:
            before = r"(?<![A-Za-z0-9_])"
        if not term[-1].isalnum():
            # Names such as "C++" / "C#" end in a symbol; allow "C++17".
            after = ""
        elif len(words[-1]) == 1:
            after = r"(?![A-Za-z0-9_+#&'’-])"
        else:
            # Digits are allowed right after the name ("Java8", "Python3").
            after = r"(?![A-Za-z_])"
        flags = re.IGNORECASE if ignore_case else 0
        return re.search(before + re.escape(term) + after, text, flags) is not None

    found_languages = []
    for language in common_languages:
        direct_hit = is_mentioned(language, ignore_case=language not in case_sensitive_names)
        if direct_hit or any(is_mentioned(s) for s in language_synonyms.get(language, ())):
            found_languages.append(language)

    return found_languages


def _combine_job_text(row, columns):
    """Join the non-missing, non-blank values of *columns* in *row* with spaces."""
    parts = []
    for column in columns:
        value = row.get(column, "")
        if pd.isna(value):
            # str(NaN) would inject the literal word "nan" into the prompt.
            continue
        value = str(value).strip()
        if value:
            parts.append(value)
    return " ".join(parts)


def _load_checkpoint(checkpoint_file, row_count):
    """Return (start_idx, languages, fallback_flags) restored from the checkpoint file.

    Fresh state (start index 0) is returned when the file is missing or does
    not match the current dataset.
    """
    import os

    language_col = 'extracted_programming_languages_str'
    languages = [None] * row_count
    fallback_flags = [False] * row_count

    if not os.path.exists(checkpoint_file):
        return 0, languages, fallback_flags

    checkpoint_df = pd.read_csv(checkpoint_file)
    if language_col not in checkpoint_df.columns or len(checkpoint_df) != row_count:
        print("Checkpoint does not match the current dataset; starting from the beginning.")
        return 0, languages, fallback_flags

    if 'extraction_done' in checkpoint_df.columns:
        done = checkpoint_df['extraction_done'].fillna(False).astype(bool).tolist()
        start_idx = done.index(False) if False in done else row_count
    else:
        # Older checkpoints have no progress column. Empty results are read
        # back as NaN, so counting non-null cells is unreliable; rows are
        # processed in order, so everything up to the last non-null cell is
        # known to be done and the rest is simply processed again.
        has_value = checkpoint_df[language_col].notna().tolist()
        start_idx = row_count - has_value[::-1].index(True) if True in has_value else 0

    restored = checkpoint_df[language_col].tolist()
    languages[:start_idx] = ['' if pd.isna(v) else str(v) for v in restored[:start_idx]]
    if 'used_fallback' in checkpoint_df.columns:
        flags = checkpoint_df['used_fallback'].tolist()
        fallback_flags[:start_idx] = [bool(v) if pd.notna(v) else False for v in flags[:start_idx]]

    return start_idx, languages, fallback_flags


def main():
    """
    Extract programming languages for every job in seek_jobs.csv.

    For each row the job type, title, details and description are combined
    and sent to extract_programming_languages_with_ollama; when that returns
    nothing, fallback_extract_languages is used instead. Progress is written
    to a checkpoint CSV every 50 jobs, per-job output goes to a log file, and
    the final dataset is saved to seek_jobs_with_languages.csv.
    """
    import traceback
    from contextlib import redirect_stdout

    print("✅ Script started")

    file_path = "seek_jobs.csv"
    output_path = "seek_jobs_with_languages.csv"
    log_path = "language_extraction_log.txt"
    checkpoint_file = "language_extraction_checkpoint.csv"
    checkpoint_interval = 50

    # Columns to extract programming languages from (in prompt order)
    title_column = "job_title"
    text_columns = ["job_type", title_column, "job_details", "description"]

    try:
        print(f"Reading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        total_jobs = len(df)
        print(f"📄 Data loaded: {total_jobs} rows")

        print(f"Total records: {total_jobs}")
        print("Starting programming languages extraction...\n")

        start_idx, extracted_languages_str, used_fallback = _load_checkpoint(
            checkpoint_file, total_jobs
        )
        if start_idx:
            print(f"Resuming from checkpoint at index {start_idx}")

        # Per-job output goes to the log file; stdout is restored and the
        # file closed automatically, even if a job raises.
        with open(log_path, 'w', encoding='utf-8') as log_file, redirect_stdout(log_file):
            for idx in range(start_idx, total_jobs):
                row = df.iloc[idx]
                print(f"\nJob {idx + 1}/{total_jobs}:")

                job_title = row[title_column] if title_column in df.columns else "Not specified"
                print(f"Job Title: {job_title}")

                combined_text = _combine_job_text(row, text_columns)

                if combined_text:
                    print("Processing combined text:")
                    # Try with Ollama first
                    languages = extract_programming_languages_with_ollama(combined_text)

                    # If Ollama fails or returns empty, try fallback
                    if not languages:
                        print("Ollama extraction failed or returned empty. Using fallback extraction.")
                        languages = fallback_extract_languages(combined_text)
                        used_fallback[idx] = True

                    extracted_languages_str[idx] = ', '.join(languages)
                else:
                    print("Combined text is empty for this job")
                    extracted_languages_str[idx] = ''

                print("-" * 50)

                # Save checkpoint periodically
                if (idx + 1) % checkpoint_interval == 0 or (idx + 1) == total_jobs:
                    print(f"Creating checkpoint at index {idx}")
                    temp_df = df.copy()
                    temp_df['extracted_programming_languages_str'] = extracted_languages_str
                    temp_df['used_fallback'] = used_fallback
                    # Explicit progress marker: an empty result and an
                    # unprocessed row look identical once written to CSV.
                    temp_df['extraction_done'] = [i <= idx for i in range(total_jobs)]
                    temp_df.to_csv(checkpoint_file, index=False)

                # Avoid overwhelming the local Ollama service
                time.sleep(random.uniform(0.5, 1.5))

        df['extracted_programming_languages_str'] = extracted_languages_str
        df['used_fallback'] = used_fallback

        df.to_csv(output_path, index=False)
        print(f"\nUpdated dataset saved to {output_path}")

        print("\n--- Sample Results ---")
        sample_columns = [
            column
            for column in (title_column, 'extracted_programming_languages_str', 'used_fallback')
            if column in df.columns
        ]
        print(df[sample_columns].head(10))

        # Print summary statistics
        jobs_with_languages = sum(1 for langs in extracted_languages_str if langs)
        jobs_using_fallback = sum(used_fallback)
        # Guard the percentages against an empty dataset.
        denominator = total_jobs or 1

        print(f"\n--- Summary Statistics ---")
        print(f"Total jobs processed: {total_jobs}")
        print(f"Jobs with extracted languages: {jobs_with_languages} ({jobs_with_languages/denominator*100:.1f}%)")
        print(f"Jobs using fallback extraction: {jobs_using_fallback} ({jobs_using_fallback/denominator*100:.1f}%)")
        print(f"\nDetailed logs saved to {log_path}")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the run.
        print(f"Script error: {str(e)}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

Language extraction and saving data into a CSV file (Source code)

Next Up

Lesson 34: Industry Classification and saving data into a CSV file (Source code)

Next Up

Lesson 34: Industry Classification and saving data into a CSV file (Source code)