AusBiz Consulting

Python

import json
import re
import time

import pandas as pd
import requests
5
# Defaults for the local Ollama endpoint used by the extractor.
_OLLAMA_API_URL = "http://localhost:11434/api/generate"
_OLLAMA_MODEL = "gemma3:1b"

# Model answers that are frameworks, tools, platforms or methodologies rather
# than languages; compared case-insensitively.
_NON_LANGUAGES = frozenset({
    "react", "angular", "node.js", "django", "flask", "spring", "express",
    "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes",
    "git", "jenkins", "jira", "kanban", "teamwork",
})

# Canonical language name -> lowercase spellings that count as a mention when
# they occur as a standalone token in the lowercased job description.
_LANGUAGE_TERMS = {
    "JavaScript": ("javascript", "js", "ecmascript"),
    "TypeScript": ("typescript", "ts"),
    "Python": ("python", "py", "python3", "python 3", "python2", "python 2"),
    "Java": ("java", "jvm"),
    "C#": ("c#", "csharp", "c sharp"),
    "C++": ("c++", "cpp", "cplusplus", "c plus plus"),
    "Ruby": ("ruby", "rb", "ruby on rails", "rails"),
    "PHP": ("php",),
    "Swift": ("swift",),
    "Kotlin": ("kotlin", "kt"),
    # Bare "go" is an ordinary English word; see _CASE_SENSITIVE_TERMS.
    "Go": ("golang", "go language"),
    "SQL": ("sql", "tsql", "plsql", "mysql", "postgresql", "oracle sql",
            "sql server"),
    "HTML": ("html", "html5", "html 5"),
    "CSS": ("css", "css3", "css 3", "scss", "sass"),
    # Bare "r" is a single letter; see _CASE_SENSITIVE_TERMS.
    "R": ("r programming", "r language"),
    "Scala": ("scala",),
    "Rust": ("rust", "rust lang"),
    "MATLAB": ("matlab",),
    "Perl": ("perl",),
    "Bash": ("bash", "shell", "shell script", "shell scripting"),
    "PowerShell": ("powershell", "power shell"),
    "VBA": ("vba", "visual basic", "visual basic for applications"),
    "Groovy": ("groovy",),
    "Fortran": ("fortran",),
    # A lone "c" is far too ambiguous, so C is accepted only with context.
    "C": ("c programming", "c language", "ansi c", "programming in c",
          "code in c", "c developer"),
}

# Names too short to match case-insensitively; these exact spellings are
# looked up in the original (not lowercased) text. Still a heuristic: e.g.
# "Go" at the start of a sentence will match.
_CASE_SENSITIVE_TERMS = {
    "R": ("R",),
    "Go": ("Go",),
}

# Lowercased model answer -> canonical language name.
_CANONICAL_BY_ALIAS = {
    alias: canonical
    for canonical, terms in _LANGUAGE_TERMS.items()
    for alias in (canonical.lower(), *terms)
}


def _build_prompt(job_description):
    """Return the extraction prompt with *job_description* embedded."""
    return f"""
    You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY programming, scripting, and markup languages
    2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc.
    3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum)
    4. Do NOT include job titles, responsibilities, or soft skills
    5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text
    6. If no programming languages are mentioned, return an empty array []
    7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript")

    CORRECT EXAMPLES:
    - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS"
    - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile"

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of programming languages found.
    Example response format: ["Python", "JavaScript", "SQL"]
    """


def _contains_token(term, text):
    """Return True when *term* occurs in *text* as a standalone token.

    A plain substring test would find "java" in "javascript", "sql" in
    "nosql" and "js" in "node.js". The term must therefore not be preceded
    by a word character or "." nor followed by a word character, "+" or "#".
    """
    pattern = r"(?<![A-Za-z0-9_.])" + re.escape(term) + r"(?![A-Za-z0-9_+#])"
    return re.search(pattern, text) is not None


def _is_mentioned(canonical, text, text_lower):
    """Return True when the known language *canonical* is mentioned in the text."""
    if any(_contains_token(term, text_lower)
           for term in _LANGUAGE_TERMS[canonical]):
        return True
    exact_terms = _CASE_SENSITIVE_TERMS.get(canonical, ())
    return any(_contains_token(term, text) for term in exact_terms)


def _parse_model_answer(answer):
    """Return the string items of the JSON array embedded in *answer*.

    Returns None (after printing a diagnostic) when no JSON array can be
    recovered. Non-string array items are dropped.
    """
    if not isinstance(answer, str):
        print(f"Couldn't find JSON array in: {answer}")
        return None

    start = answer.find("[")
    end = answer.rfind("]")
    if start < 0 or end < start:
        print(f"Couldn't find JSON array in: {answer}")
        return None

    # Try the widest bracketed span first; if the model appended further
    # bracketed text, fall back to the first closing bracket.
    spans = [answer[start:end + 1]]
    first_close = answer.find("]", start)
    if first_close != end:
        spans.append(answer[start:first_close + 1])

    for span in spans:
        try:
            parsed = json.loads(span)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return [item for item in parsed if isinstance(item, str)]

    print(f"Failed to parse JSON from: {answer}")
    return None


def _verify_languages(candidates, job_description):
    """Keep only candidates that really occur in *job_description*.

    Known languages are returned under their canonical spelling; unknown
    names keep the model's spelling. Duplicates are removed and the model's
    order is preserved.
    """
    text_lower = job_description.lower()
    verified = {}
    for candidate in candidates:
        name = candidate.strip()
        key = name.lower()
        if not key or key in _NON_LANGUAGES:
            continue

        canonical = _CANONICAL_BY_ALIAS.get(key)
        if canonical is None:
            # Unknown name: accept it only when it appears verbatim. Unknown
            # single characters are too ambiguous to trust.
            if len(key) > 1 and _contains_token(key, text_lower):
                verified.setdefault(key, name)
            continue

        if _is_mentioned(canonical, job_description, text_lower):
            verified.setdefault(canonical.lower(), canonical)
    return list(verified.values())


def extract_programming_languages_with_ollama(
    job_description,
    *,
    model=_OLLAMA_MODEL,
    api_url=_OLLAMA_API_URL,
    timeout=30,
    session=None,
):
    """Extract programming languages from a job description via Ollama.

    The model is asked for a JSON array of language names; every name is
    then verified against the text so that hallucinated languages are
    discarded.

    Args:
        job_description: Text to analyse. Anything that is not a non-blank
            string (None, NaN, numbers, "") yields an empty result.
        model: Name of the Ollama model to query.
        api_url: URL of the Ollama ``/api/generate`` endpoint.
        timeout: Request timeout in seconds.
        session: Optional object with a ``requests``-compatible ``post``
            method (e.g. a ``requests.Session`` for connection reuse).
            Defaults to the ``requests`` module itself.

    Returns:
        A list of verified language names without duplicates, in the order
        the model reported them. Empty when nothing was found or when the
        request or the parsing failed (a diagnostic is printed in that case).
    """
    if not isinstance(job_description, str) or not job_description.strip():
        return []

    http = requests if session is None else session
    try:
        response = http.post(
            api_url,
            json={
                "model": model,
                "prompt": _build_prompt(job_description),
                "stream": False,
            },
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Exception when calling Ollama API: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error from Ollama API: {response.status_code} - {response.text}")
        return []

    try:
        answer = response.json()["response"]
    except (ValueError, KeyError, TypeError) as exc:
        # Body was not JSON, or lacked the expected "response" field.
        print(f"Exception when calling Ollama API: {exc}")
        return []

    languages = _parse_model_answer(answer)
    if languages is None:
        return []

    verified_languages = _verify_languages(languages, job_description)
    print(f"Initial extraction: {languages}")
    print(f"After verification: {verified_languages}")
    return verified_languages
171
def main(file_path="seek_jobs.csv", output_path="seek_jobs_with_languages.csv"):
    """Extract programming languages for every job in a CSV dataset.

    Reads *file_path*, runs ``extract_programming_languages_with_ollama`` on
    the ``description`` and ``job_details`` columns of each row (whichever
    exist), and stores the combined result in two new columns:
    ``extracted_programming_languages`` (list) and
    ``programming_languages_str`` (comma-separated string).

    Args:
        file_path: CSV file to read.
        output_path: CSV file the enriched dataset is written to, so the
            results of the run are not lost.

    Returns:
        The enriched DataFrame.
    """
    title_column = "job_title"
    # (column name, label used in the "column is empty" message)
    text_columns = (
        ("description", "Description"),
        ("job_details", "Job details"),
    )

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)

    total = len(df)
    print(f"Total records: {total}")
    print("Starting programming languages extraction...\n")

    has_title = title_column in df.columns
    available_columns = [
        (column, label) for column, label in text_columns if column in df.columns
    ]

    all_languages = []
    # Iterate rows directly rather than by label so the loop does not depend
    # on the DataFrame having a default 0..n-1 index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Job {position}/{total}:")

        if not has_title:
            print("Job Title column not found in dataset")
        else:
            job_title = row[title_column]
            if pd.isna(job_title) or job_title == "":
                print("Job Title: [Not specified]")
            else:
                print(f"Job Title: {job_title}")

        job_languages = []
        for column, label in available_columns:
            text = row[column]
            if pd.isna(text) or text == "":
                print(f"{label} column is empty for this job")
                continue

            print(f"Processing from {column} column:")
            found = extract_programming_languages_with_ollama(text)
            if found:
                print(f"Extracted programming languages from {column}: {found}")
                job_languages.extend(found)
            else:
                print(f"No programming languages extracted from {column}")

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is deterministic (a set would shuffle it between runs).
        unique_languages = list(dict.fromkeys(job_languages))
        if unique_languages:
            print(f"Combined unique programming languages: {unique_languages}")
        all_languages.append(unique_languages)

        print("-" * 50)

        # Brief pause between jobs to avoid hammering the local model server.
        time.sleep(0.5)

    df['extracted_programming_languages'] = all_languages
    df['programming_languages_str'] = (
        df['extracted_programming_languages'].apply(', '.join)
    )

    df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")
    return df


if __name__ == "__main__":
    main()

1import json 2import pandas as pd 3import requests 4import time 5 6def extract_programming_languages_with_ollama(job_description): 7 """ 8 Extract programming languages from job description using Ollama with Gemma model 9 """ 10 # Handle missing or NaN values 11 if pd.isna(job_description) or job_description == "": 12 return [] 13 14 job_description_lower = job_description.lower() 15 16 17 prompt = f""" 18 You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description. 19 20 STRICT RULES: 21 1. Extract ONLY programming, scripting, and markup languages 22 2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc. 23 3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum) 24 4. Do NOT include job titles, responsibilities, or soft skills 25 5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text 26 6. If no programming languages are mentioned, return an empty array [] 27 7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript") 28 29 CORRECT EXAMPLES: 30 - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS" 31 - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile" 32 33 Job Description: 34 {job_description} 35 36 Respond ONLY with a JSON array containing the list of programming languages found. 
37 Example response format: ["Python", "JavaScript", "SQL"] 38 """ 39 40 # Make request to local Ollama API 41 try: 42 response = requests.post( 43 "http://localhost:11434/api/generate", 44 json={ 45 "model": "gemma3:1b", 46 "prompt": prompt, 47 "stream": False 48 }, 49 timeout=30 50 ) 51 52 if response.status_code == 200: 53 result = response.json()["response"] 54 55 # Extract JSON array from the response 56 try: 57 # Try to find JSON array in the response 58 start_idx = result.find('[') 59 end_idx = result.rfind(']') + 1 60 61 if start_idx >= 0 and end_idx > start_idx: 62 json_str = result[start_idx:end_idx] 63 languages = json.loads(json_str) 64 65 # Additional filtering to remove non-language items 66 non_languages = ["react", "angular", "node.js", "django", "flask", "spring", "express", 67 "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes", 68 "git", "jenkins", "jira", "agile", "scrum", "kanban", "teamwork"] 69 70 # Filter out any items that are likely not programming languages 71 filtered_languages = [lang for lang in languages 72 if lang.lower() not in [nl.lower() for nl in non_languages] 73 and len(lang) > 1] # Avoid single characters except for specific cases like R 74 75 # Special case for single character languages like R 76 if "R" in languages and "R" not in filtered_languages: 77 filtered_languages.append("R") 78 79 # CRITICAL: Verify each language actually appears in the job description 80 verified_languages = [] 81 82 # Common variations and synonyms for programming languages 83 language_synonyms = { 84 "javascript": ["js", "javascript", "ecmascript"], 85 "typescript": ["ts", "typescript"], 86 "python": ["py", "python3", "python 3", "python2", "python 2"], 87 "java": ["java", "jvm"], 88 "c#": ["c#", "csharp", "c sharp"], 89 "c++": ["c++", "cpp", "cplusplus", "c plus plus"], 90 "ruby": ["rb", "ruby on rails", "rails"], 91 "php": ["php"], 92 "swift": ["swift"], 93 "kotlin": ["kt", "kotlin"], 94 "go": ["golang", "go language"], 95 "sql": 
["sql", "tsql", "plsql", "mysql", "postgresql", "oracle sql", "sql server"], 96 "html": ["html", "html5", "html 5"], 97 "css": ["css", "css3", "css 3", "scss", "sass"], 98 "r": ["r programming", "r language"], 99 "scala": ["scala"], 100 "rust": ["rust", "rust lang"], 101 "matlab": ["matlab"], 102 "perl": ["perl"], 103 "bash": ["bash", "shell", "shell script", "shell scripting"], 104 "powershell": ["powershell", "power shell"], 105 "vba": ["visual basic", "visual basic for applications"], 106 "groovy": ["groovy"], 107 "fortran": ["fortran"] 108 } 109 110 for language in filtered_languages: 111 language_lower = language.lower() 112 113 # Check if the language directly appears in the job description 114 if language_lower in job_description_lower: 115 verified_languages.append(language) 116 continue 117 118 # Check for synonyms and variations 119 for main_lang, synonyms in language_synonyms.items(): 120 if language_lower == main_lang or language_lower in synonyms: 121 # Check if any synonyms appear in the text 122 if any(syn in job_description_lower for syn in synonyms) or main_lang in job_description_lower: 123 # Standardize the language name with proper capitalization 124 if main_lang == "javascript": 125 verified_languages.append("JavaScript") 126 elif main_lang == "typescript": 127 verified_languages.append("TypeScript") 128 elif main_lang == "c#": 129 verified_languages.append("C#") 130 elif main_lang == "c++": 131 verified_languages.append("C++") 132 elif main_lang == "php": 133 verified_languages.append("PHP") 134 elif main_lang == "sql": 135 verified_languages.append("SQL") 136 elif main_lang == "html": 137 verified_languages.append("HTML") 138 elif main_lang == "css": 139 verified_languages.append("CSS") 140 elif main_lang == "r": 141 verified_languages.append("R") 142 elif main_lang == "vba": 143 verified_languages.append("VBA") 144 else: 145 verified_languages.append(main_lang.title()) # Use standardized version 146 break 147 148 # Handle special case for 
"C" language to avoid false positives 149 if "C" in languages and not any(lang in verified_languages for lang in ["C++", "C#"]): 150 # Verify it's actually the C language with context patterns 151 c_language_patterns = ["c programming", "c language", "ansi c", 152 "programming in c", "code in c", "c developer"] 153 if any(pattern in job_description_lower for pattern in c_language_patterns): 154 verified_languages.append("C") 155 156 print(f"Initial extraction: {languages}") 157 print(f"After verification: {verified_languages}") 158 return verified_languages 159 else: 160 print(f"Couldn't find JSON array in: {result}") 161 return [] 162 except json.JSONDecodeError: 163 print(f"Failed to parse JSON from: {result}") 164 return [] 165 else: 166 print(f"Error from Ollama API: {response.status_code} - {response.text}") 167 return [] 168 except Exception as e: 169 print(f"Exception when calling Ollama API: {str(e)}") 170 return [] 171 172def main(): 173 file_path = "seek_jobs.csv" 174 175 # Columns to extract programming languages from 176 title_column = "job_title" # Column containing job titles 177 description_column = "description" 178 job_details_column = "job_details" 179 180 print(f"Reading dataset from {file_path}...") 181 df = pd.read_csv(file_path) 182 183 print(f"Total records: {len(df)}") 184 print("Starting programming languages extraction...\n") 185 186 # Prepare to store all extracted programming languages 187 all_languages = [] 188 189 for idx in range(len(df)): 190 print(f"Job {idx+1}/{len(df)}:") 191 job_languages = [] 192 193 # Print job title first 194 if title_column in df.columns: 195 job_title = df.loc[idx, title_column] 196 if not pd.isna(job_title) and job_title != "": 197 print(f"Job Title: {job_title}") 198 else: 199 print("Job Title: [Not specified]") 200 else: 201 print("Job Title column not found in dataset") 202 203 # Process description column 204 if description_column in df.columns: 205 description = df.loc[idx, description_column] 206 if 
not pd.isna(description) and description != "": 207 print(f"Processing from description column:") 208 langs_desc = extract_programming_languages_with_ollama(description) 209 210 if langs_desc: 211 print(f"Extracted programming languages from description: {langs_desc}") 212 job_languages.extend(langs_desc) 213 else: 214 print("No programming languages extracted from description") 215 else: 216 print("Description column is empty for this job") 217 218 # Process job_details column 219 if job_details_column in df.columns: 220 job_details = df.loc[idx, job_details_column] 221 if not pd.isna(job_details) and job_details != "": 222 print(f"Processing from job_details column:") 223 langs_details = extract_programming_languages_with_ollama(job_details) 224 225 if langs_details: 226 print(f"Extracted programming languages from job_details: {langs_details}") 227 job_languages.extend(langs_details) 228 else: 229 print("No programming languages extracted from job_details") 230 else: 231 print("Job details column is empty for this job") 232 233 # Combine unique programming languages from both columns 234 if job_languages: 235 unique_languages = list(set(job_languages)) 236 print(f"Combined unique programming languages: {unique_languages}") 237 all_languages.append(unique_languages) 238 else: 239 all_languages.append([]) 240 241 print("-" * 50) 242 243 time.sleep(0.5) 244 245 df['extracted_programming_languages'] = all_languages 246 247 248 df['programming_languages_str'] = df['extracted_programming_languages'].apply(lambda x: ', '.join(x) if x else '') 249 250 251if __name__ == "__main__": 252 main()

Language extraction from dataset (Source code)

Next Up

Lesson 29: Industry Classification from dataset (Source code)

Next Up

Lesson 29: Industry Classification from dataset (Source code)