import json
import re
import time

import pandas as pd
import requests
5
# Names the model tends to return that are frameworks, tools, platforms or
# methodologies rather than languages. Compared case-insensitively.
_NON_LANGUAGES = frozenset({
    "react", "angular", "node.js", "django", "flask", "spring", "express",
    "vue", "scrum", "agile", "aws", "azure", "docker", "kubernetes",
    "git", "jenkins", "jira", "kanban", "teamwork",
})

# Canonical language (lowercase) -> alternative spellings that count as a
# mention of that language in a job description.
_LANGUAGE_SYNONYMS = {
    "javascript": ["js", "javascript", "ecmascript"],
    "typescript": ["ts", "typescript"],
    "python": ["py", "python3", "python 3", "python2", "python 2"],
    "java": ["java", "jvm"],
    "c#": ["c#", "csharp", "c sharp"],
    "c++": ["c++", "cpp", "cplusplus", "c plus plus"],
    "ruby": ["rb", "ruby on rails", "rails"],
    "php": ["php"],
    "swift": ["swift"],
    "kotlin": ["kt", "kotlin"],
    "go": ["golang", "go language"],
    "sql": ["sql", "tsql", "plsql", "mysql", "postgresql", "oracle sql", "sql server"],
    "html": ["html", "html5", "html 5"],
    "css": ["css", "css3", "css 3", "scss", "sass"],
    "r": ["r programming", "r language"],
    "scala": ["scala"],
    "rust": ["rust", "rust lang"],
    "matlab": ["matlab"],
    "perl": ["perl"],
    "bash": ["bash", "shell", "shell script", "shell scripting"],
    "powershell": ["powershell", "power shell"],
    "vba": ["visual basic", "visual basic for applications"],
    "groovy": ["groovy"],
    "fortran": ["fortran"],
}

# Display spelling for canonical names where str.title() would be wrong.
_DISPLAY_NAMES = {
    "javascript": "JavaScript",
    "typescript": "TypeScript",
    "c#": "C#",
    "c++": "C++",
    "php": "PHP",
    "sql": "SQL",
    "html": "HTML",
    "css": "CSS",
    "r": "R",
    "vba": "VBA",
}

# Phrases that count as an explicit mention of plain C.
_C_LANGUAGE_PATTERNS = (
    "c programming", "c language", "ansi c",
    "programming in c", "code in c", "c developer",
)


def _mentions(term, text_lower):
    """Return True when *term* occurs in *text_lower* as a whole token.

    A plain substring test confirms "java" inside "javascript", "r" inside
    almost any sentence and "c programming" inside "basic programming".
    The lookarounds require that the match is not glued to a preceding
    word character or dot ("node.js" is not a mention of "js") nor to a
    following word character, "+" or "#" ("c" is not a mention in "c++").
    """
    pattern = r"(?<![\w.])" + re.escape(term) + r"(?![\w+#])"
    return re.search(pattern, text_lower) is not None


def _canonical_language(name_lower):
    """Return the canonical key whose spellings include *name_lower*, else None."""
    for main_lang, synonyms in _LANGUAGE_SYNONYMS.items():
        if name_lower == main_lang or name_lower in synonyms:
            return main_lang
    return None


def _build_prompt(job_description):
    """Return the extraction prompt with *job_description* embedded."""
    return f"""
    You are a specialized programming languages extractor for job descriptions. Your task is to identify ONLY programming, scripting, and markup languages mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY programming, scripting, and markup languages
    2. Include ONLY: languages like Python, Java, JavaScript, TypeScript, C#, C++, Ruby, PHP, Swift, Kotlin, Go, SQL, HTML, CSS, R, Scala, Rust, MATLAB, Perl, Bash, PowerShell, etc.
    3. Do NOT include: frameworks (React, Angular), libraries (jQuery, TensorFlow), tools (Git, Docker), platforms (AWS, Azure), or methodologies (Agile, Scrum)
    4. Do NOT include job titles, responsibilities, or soft skills
    5. IMPORTANT: ONLY extract languages that are EXPLICITLY mentioned in the text
    6. If no programming languages are mentioned, return an empty array []
    7. Recognize language variations (e.g., "JS" = "JavaScript", "TS" = "TypeScript")

    CORRECT EXAMPLES:
    - INCLUDE: "Python", "Java", "C#", "JavaScript", "TypeScript", "SQL", "HTML", "CSS"
    - DO NOT INCLUDE: "React", "Angular", "Node.js", "Django", "Git", "AWS", "Agile"

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of programming languages found.
    Example response format: ["Python", "JavaScript", "SQL"]
    """


def _parse_language_array(result):
    """Return the JSON array embedded in the model output, or None.

    The array is taken from the first "[" to the last "]" so that chatter
    around the array is tolerated. Prints a diagnostic when nothing usable
    is found.
    """
    start_idx = result.find("[")
    end_idx = result.rfind("]") + 1
    if start_idx < 0 or end_idx <= start_idx:
        print(f"Couldn't find JSON array in: {result}")
        return None
    try:
        return json.loads(result[start_idx:end_idx])
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {result}")
        return None


def _verify_languages(candidates, description_lower):
    """Keep only candidates that are actually mentioned in the description.

    Non-string items returned by the model are ignored. A candidate that is
    mentioned verbatim keeps the model's spelling; one that is mentioned
    only through a synonym is reported under its canonical display name.
    Results are de-duplicated case-insensitively, preserving order.
    """
    names = [item.strip() for item in candidates if isinstance(item, str)]
    verified = []
    seen = set()

    def add(name):
        key = name.lower()
        if key not in seen:
            seen.add(key)
            verified.append(name)

    for name in names:
        lowered = name.lower()
        if lowered in _NON_LANGUAGES:
            continue
        # Single-character names are noise, except R ("C" is handled below).
        if len(name) <= 1 and name != "R":
            continue

        if _mentions(lowered, description_lower):
            add(name)
            continue

        canonical = _canonical_language(lowered)
        if canonical is None:
            continue
        spellings = (canonical, *_LANGUAGE_SYNONYMS[canonical])
        if any(_mentions(term, description_lower) for term in spellings):
            add(_DISPLAY_NAMES.get(canonical, canonical.title()))

    # Plain C is accepted only on an explicit phrase, and only when neither
    # C++ nor C# was verified, so it is never inferred from those names.
    if "C" in names and not ({"c++", "c#"} & seen):
        if any(_mentions(pattern, description_lower) for pattern in _C_LANGUAGE_PATTERNS):
            add("C")

    return verified


def extract_programming_languages_with_ollama(
    job_description,
    *,
    model="gemma3:1b",
    url="http://localhost:11434/api/generate",
    timeout=30,
):
    """
    Extract programming languages from job description using Ollama with Gemma model

    The model's answer is treated as a list of candidates only; each
    candidate is kept only if the description itself mentions it (or one of
    its known synonyms) as a whole token.

    Args:
        job_description: Free text of the job posting. Anything that is not
            a non-blank string (None, NaN, numbers, "") yields [].
        model: Ollama model name to query.
        url: Ollama "generate" endpoint.
        timeout: Request timeout in seconds.

    Returns:
        A list of language names. It is empty when the input is blank, the
        request fails, the API answers with a non-200 status, or the reply
        contains no parsable JSON array. Failures are printed, not raised.
    """
    # NaN/None/numeric cells are not strings; previously a non-NaN
    # non-string crashed on .lower() before any error handling.
    if not isinstance(job_description, str) or not job_description.strip():
        return []

    try:
        response = requests.post(
            url,
            json={
                "model": model,
                "prompt": _build_prompt(job_description),
                "stream": False,
            },
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    if response.status_code != 200:
        print(f"Error from Ollama API: {response.status_code} - {response.text}")
        return []

    try:
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    result = payload.get("response") if isinstance(payload, dict) else None
    if not isinstance(result, str):
        print(f"Unexpected payload from Ollama API: {payload}")
        return []

    languages = _parse_language_array(result)
    if languages is None:
        return []

    verified_languages = _verify_languages(languages, job_description.lower())

    print(f"Initial extraction: {languages}")
    print(f"After verification: {verified_languages}")
    return verified_languages
171
def main(file_path="seek_jobs.csv", output_path="seek_jobs_with_languages.csv"):
    """Extract programming languages for every job in a CSV and save the result.

    Reads *file_path*, runs the extractor on the "description" and
    "job_details" columns of each row (whichever exist), and adds two
    columns: "extracted_programming_languages" (list per row) and
    "programming_languages_str" (comma-separated). The augmented table is
    written to *output_path*; previously the new columns were computed and
    then thrown away when the function returned.

    Args:
        file_path: Input CSV path.
        output_path: Where the augmented CSV is written.

    Returns:
        The augmented DataFrame.
    """
    title_column = "job_title"
    # (column name, label used in the "... column is empty" message)
    text_columns = (("description", "Description"), ("job_details", "Job details"))

    def cell_text(value):
        # NaN/None -> "", everything else -> stripped string, so the
        # extractor is never handed a non-string cell.
        if value is None or pd.isna(value):
            return ""
        return str(value).strip()

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)

    total = len(df)
    print(f"Total records: {total}")
    print("Starting programming languages extraction...\n")

    all_languages = []

    # iterrows() walks rows by position, so this works for any index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Job {position}/{total}:")
        job_languages = []

        if title_column in df.columns:
            job_title = row[title_column]
            if not pd.isna(job_title) and job_title != "":
                print(f"Job Title: {job_title}")
            else:
                print("Job Title: [Not specified]")
        else:
            print("Job Title column not found in dataset")

        for column, label in text_columns:
            if column not in df.columns:
                continue

            text = cell_text(row[column])
            if not text:
                print(f"{label} column is empty for this job")
                continue

            print(f"Processing from {column} column:")
            found = extract_programming_languages_with_ollama(text)
            if found:
                print(f"Extracted programming languages from {column}: {found}")
                job_languages.extend(found)
            else:
                print(f"No programming languages extracted from {column}")

        # dict.fromkeys de-duplicates while keeping first-seen order;
        # set() gave a different order on every run.
        unique_languages = list(dict.fromkeys(job_languages))
        if unique_languages:
            print(f"Combined unique programming languages: {unique_languages}")
        all_languages.append(unique_languages)

        print("-" * 50)

        # Brief pause between jobs to avoid hammering the local model server.
        time.sleep(0.5)

    df["extracted_programming_languages"] = pd.Series(
        all_languages, index=df.index, dtype=object
    )
    df["programming_languages_str"] = [", ".join(langs) for langs in all_languages]

    df.to_csv(output_path, index=False)
    print(f"Saved results to {output_path}")
    return df
250
# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()