1import json
2import pandas as pd
3import requests
4import time
5
# Terms the model sometimes returns that are technical skills or credentials
# rather than soft skills. Candidates are compared against this set in
# lower case, as whole strings (not substrings).
_NON_SOFT_SKILLS = frozenset({
    "python", "java", "excel", "sql", "aws", "azure", "tableau",
    "powerbi", "data", "analytics", "database", "programming",
    "degree", "bachelor", "master", "certification", "license",
})

# Canonical soft skill -> phrases accepted as textual evidence for it.
_SKILL_SYNONYMS = {
    "communication": ["communicate", "verbal", "written communication", "articulate", "presenting"],
    "leadership": ["lead", "leading", "leader", "motivate", "influence"],
    "teamwork": ["team player", "collaborate", "collaboration", "team-oriented", "team work"],
    "adaptability": ["adapt", "flexible", "flexibility", "versatile", "versatility"],
    "problem-solving": ["solve", "problem solver", "analytical thinking", "solution-oriented", "troubleshooting"],
    "time management": ["prioritize", "prioritization", "deadlines", "punctual", "time-conscious"],
    "creativity": ["creative", "innovative", "innovation", "creative thinking", "think outside the box"],
    "critical thinking": ["analytical", "analyze", "critical", "logical", "reasoning"],
}


def _parse_skill_array(text):
    """Return the first JSON array embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing commentary (even if it contains ']') is ignored.
            value, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            value = None
        if isinstance(value, list):
            return value
        start = text.find("[", start + 1)
    return None


def _verify_skills(candidates, description_lower):
    """Keep only the candidate skills that are evidenced in the description text."""
    verified = []
    seen = set()

    def _add(name):
        key = name.lower()
        if key not in seen:
            seen.add(key)
            verified.append(name)

    for candidate in candidates:
        # The model is free-form; anything that is not a string is ignored
        # instead of aborting the whole extraction.
        if not isinstance(candidate, str):
            continue
        skill = candidate.strip()
        skill_lower = skill.lower()
        if len(skill) <= 1 or skill_lower in _NON_SOFT_SKILLS:
            continue

        # Direct evidence: the skill text itself occurs in the description.
        if skill_lower in description_lower:
            _add(skill)
            continue

        # Indirect evidence: the skill belongs to a known synonym group and
        # any phrase of that group occurs in the description.
        for main_skill, synonyms in _SKILL_SYNONYMS.items():
            if skill_lower != main_skill and skill_lower not in synonyms:
                continue
            if main_skill in description_lower or any(
                syn in description_lower for syn in synonyms
            ):
                _add(main_skill.title())
            break

    return verified


def extract_soft_skills_with_ollama(
    job_description,
    *,
    model="gemma3:1b",
    api_url="http://localhost:11434/api/generate",
    timeout=30,
):
    """
    Extract soft skills from a job description using an Ollama-served model.

    The model is asked for a JSON array of soft skills. Each returned item is
    then checked against the description: it is kept only if the item itself,
    or a phrase from its synonym group, appears in the text.

    Args:
        job_description: Text of the job posting. Missing values (None, NaN),
            non-scalar values and blank strings yield an empty result without
            calling the model; other non-string scalars are converted with
            ``str()``.
        model: Name of the model passed to the generate endpoint.
        api_url: URL of the Ollama generate endpoint.
        timeout: Request timeout in seconds.

    Returns:
        A list of verified soft-skill names without case-insensitive
        duplicates. An empty list is returned on any request, HTTP or
        parsing failure; a diagnostic message is printed in that case.
    """
    if job_description is None:
        return []
    if not isinstance(job_description, str):
        if not pd.api.types.is_scalar(job_description) or pd.isna(job_description):
            return []
        job_description = str(job_description)
    if not job_description.strip():
        return []

    job_description_lower = job_description.lower()

    prompt = f"""
    You are a specialized soft skills extractor for job descriptions. Your task is to identify ONLY soft skills, interpersonal abilities, and personal attributes mentioned in the job description.

    STRICT RULES:
    1. Extract ONLY soft skills and interpersonal abilities
    2. Include ONLY: communication skills, leadership qualities, interpersonal traits, personal attributes
    3. Do NOT include: technical skills, tools, programming languages, or job requirements
    4. Do NOT include job titles, responsibilities, or education requirements
    5. IMPORTANT: ONLY extract soft skills that are EXPLICITLY mentioned in the text
    6. If no soft skills are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: communication, teamwork, leadership, problem-solving, adaptability, time management, creativity, critical thinking
    - DO NOT INCLUDE: Python, project management (as a role), BA degree, Excel, analytics (as a field)

    Job Description:
    {job_description}

    Respond ONLY with a JSON array containing the list of soft skills found.
    Example response format: ["Communication", "Teamwork", "Leadership"]
    """

    # Only the network call is guarded here, so that programming errors in
    # the parsing code below are not misreported as API failures.
    try:
        response = requests.post(
            api_url,
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
            },
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    if response.status_code != 200:
        print(f"Error from Ollama API: {response.status_code} - {response.text}")
        return []

    try:
        result = response.json()["response"]
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or did not have the expected "response" field.
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    if not isinstance(result, str):
        print(f"Couldn't find JSON array in: {result}")
        return []

    soft_skills = _parse_skill_array(result)
    if soft_skills is None:
        if "[" in result:
            print(f"Failed to parse JSON from: {result}")
        else:
            print(f"Couldn't find JSON array in: {result}")
        return []

    verified_skills = _verify_skills(soft_skills, job_description_lower)

    print(f"Initial extraction: {soft_skills}")
    print(f"After verification: {verified_skills}")
    return verified_skills
121
def _dedupe_skills(skills):
    """Return *skills* without case-insensitive duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for skill in skills:
        key = skill.lower()
        if key not in seen:
            seen.add(key)
            unique.append(skill)
    return unique


def main(file_path="seek_jobs.csv", output_path=None):
    """
    Run soft-skill extraction over every job in a CSV dataset.

    For each row, the ``description`` and ``job_details`` columns (when they
    exist and are non-empty) are passed to ``extract_soft_skills_with_ollama``
    and the results are merged. Two columns are added to the DataFrame:
    ``extracted_soft_skills`` (list per row) and ``soft_skills_str`` (the same
    list joined with ", "). Progress is printed for each job.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Optional path; when given, the enriched DataFrame is
            written there as CSV (without the index). Nothing is written when
            it is None.

    Returns:
        The DataFrame with the two added columns.
    """
    title_column = "job_title"
    # (column name as used in the dataset and in progress messages,
    #  label used in the "... column is empty" message)
    text_columns = (
        ("description", "Description"),
        ("job_details", "Job details"),
    )

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)

    total = len(df)
    print(f"Total records: {total}")
    print("Starting soft skills extraction...\n")

    # Column presence does not change between rows, so check it once.
    has_title = title_column in df.columns
    present_columns = [spec for spec in text_columns if spec[0] in df.columns]

    all_soft_skills = []

    for position in range(total):
        print(f"Job {position + 1}/{total}:")
        job_soft_skills = []

        if has_title:
            # Positional access: correct regardless of the index labels.
            job_title = df[title_column].iloc[position]
            if not pd.isna(job_title) and job_title != "":
                print(f"Job Title: {job_title}")
            else:
                print("Job Title: [Not specified]")
        else:
            print("Job Title column not found in dataset")

        for column, empty_label in present_columns:
            text = df[column].iloc[position]
            if pd.isna(text) or text == "":
                print(f"{empty_label} column is empty for this job")
                continue

            print(f"Processing from {column} column:")
            skills = extract_soft_skills_with_ollama(text)
            if skills:
                print(f"Extracted soft skills from {column}: {skills}")
                job_soft_skills.extend(skills)
            else:
                print(f"No soft skills extracted from {column}")

        if job_soft_skills:
            # Order-preserving so repeated runs produce identical output.
            unique_skills = _dedupe_skills(job_soft_skills)
            print(f"Combined unique soft skills: {unique_skills}")
            all_soft_skills.append(unique_skills)
        else:
            all_soft_skills.append([])

        print("-" * 50)

        # Brief pause between jobs to avoid hammering the local model server.
        time.sleep(0.5)

    df['extracted_soft_skills'] = all_soft_skills
    df['soft_skills_str'] = df['extracted_soft_skills'].apply(', '.join)

    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df
202
203
204
# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()