1import json
2import pandas as pd
3import requests
4import time
5
# Generic words the model tends to return that are not tools (compared lowercased).
_NON_TOOL_TERMS = frozenset({
    "analytical", "technical", "digital", "innovative", "advanced",
    "solution", "problem-solving", "communication", "teamwork",
    "methodology", "approach", "strategy", "skill",
})

# Long-form name the model may emit -> (short form to look for in the job text,
# name to report). A reported name of None keeps the model's own spelling.
_TOOL_ALIASES = {
    "javascript": ("js", None),
    "microsoft excel": ("excel", "Excel"),
    "amazon web services": ("aws", "AWS"),
}


def _parse_tool_array(raw):
    """Decode the outermost ``[...]`` span of *raw* as JSON; None if there is no such span."""
    start = raw.find("[")
    end = raw.rfind("]") + 1
    if start < 0 or end <= start:
        return None
    return json.loads(raw[start:end])


def _verify_tools(candidates, job_text_lower):
    """Keep the candidates that look like tool names and actually occur in the job text."""
    verified = []
    seen = set()
    for candidate in candidates:
        # The model may emit numbers, nulls or nested objects; one bad item
        # must not cost us the rest of the list.
        if not isinstance(candidate, str):
            continue
        name = candidate.strip()
        key = name.lower()
        # NOTE(review): the "ing" suffix rule removes gerund soft skills
        # ("programming", "testing") but also real tools such as "Spring".
        if len(name) <= 1 or key in _NON_TOOL_TERMS or key.endswith("ing"):
            continue

        if key in job_text_lower:
            reported = name
        else:
            alias = _TOOL_ALIASES.get(key)
            if alias is None or alias[0] not in job_text_lower:
                # Not present in the text: treat as a hallucination.
                continue
            reported = alias[1] or name

        dedupe_key = reported.lower()
        if dedupe_key not in seen:
            seen.add(dedupe_key)
            verified.append(reported)
    return verified


def extract_tools_with_ollama(job_text, *, model="gemma3:1b",
                              api_url="http://localhost:11434/api/generate",
                              timeout=30):
    """
    Extract technical tools from job text using Ollama with Gemma model.

    The model is asked for a JSON array of tool names. Each returned name is
    then filtered (generic words, single characters, words ending in "ing")
    and kept only if it, or a known short form of it, occurs in the job text.

    Args:
        job_text: Job advert text. None/NaN, empty and whitespace-only values
            yield an empty list without contacting the API; other non-string
            values are converted with ``str()``.
        model: Ollama model name sent in the request.
        api_url: URL of the Ollama ``generate`` endpoint.
        timeout: Request timeout in seconds.

    Returns:
        A list of verified tool names, without case-insensitive duplicates,
        in the order the model returned them. An empty list is returned on
        any failure (request error, non-200 status, unparseable reply); the
        reason is printed.
    """
    if not isinstance(job_text, str):
        if pd.isna(job_text):
            return []
        job_text = str(job_text)
    if not job_text.strip():
        return []

    prompt = f"""
    You are a specialized tool extractor for job descriptions. Your task is to identify ONLY specific named technical tools, technologies, software, programming languages, platforms, and frameworks.

    STRICT RULES:
    1. Extract ONLY proper noun names of specific technologies
    2. Include ONLY: software names, programming languages, frameworks, platforms, databases, cloud services
    3. Do NOT include: general skills, methodologies, concepts, or descriptive terms
    4. Do NOT include adjectives or general terms like "analytical", "digital", "innovative"
    5. IMPORTANT: ONLY extract tools that are EXPLICITLY mentioned in the text
    6. If no specific tools are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Python, Java, AWS, Azure, Excel, SQL, PowerBI, Tableau, Git, Docker, React, TensorFlow
    - DO NOT INCLUDE: analytical, technical, problem-solving, digital transformation, innovative

    Job Text:
    {job_text}

    Respond ONLY with a JSON array containing the list of specific technical tools found.
    Example response format: ["Python", "AWS", "Docker"]
    """

    # Only the network call is guarded here so that later failures are not
    # misreported as API errors.
    try:
        response = requests.post(
            api_url,
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    if response.status_code != 200:
        print(f"Error from Ollama API: {response.status_code} - {response.text}")
        return []

    try:
        result = response.json()["response"]
    except (ValueError, KeyError, TypeError) as e:
        print(f"Unexpected payload from Ollama API: {e!r}")
        return []
    if not isinstance(result, str):
        print(f"Unexpected payload from Ollama API: {result!r}")
        return []

    try:
        tools = _parse_tool_array(result)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from: {result}")
        return []
    if tools is None:
        print(f"Couldn't find JSON array in: {result}")
        return []

    verified_tools = _verify_tools(tools, job_text.lower())

    print(f"Initial extraction: {tools}")
    print(f"After verification: {verified_tools}")
    return verified_tools
105
def batch(lst, size):
    """Yield consecutive slices of ``lst``, each holding at most ``size`` items."""
    yield from (
        lst[offset:offset + size]
        for offset in range(0, len(lst), size)
    )
110
def main():
    """
    Run tool extraction over every row of ``seek_jobs.csv`` and print a sample.

    The job type, title, details and description columns that exist in the
    file are concatenated per row and passed to ``extract_tools_with_ollama``.
    Results are stored in the ``extracted_tools`` (list) and
    ``extracted_tools_str`` (comma-separated) columns of the in-memory frame,
    and the first ten rows are printed.
    """
    print("✅ Script started")

    file_path = "seek_jobs.csv"

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)

    print(f"📄 Data loaded: {len(df)} rows")

    title_column = "job_title"
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"

    # Order matters: the fields are concatenated in this order.
    text_columns = [job_type_column, title_column, job_details_column, description_column]
    present_columns = [col for col in text_columns if col in df.columns]
    missing_columns = [col for col in text_columns if col not in df.columns]
    if missing_columns:
        print(f"Warning: columns not found in dataset, skipped: {missing_columns}")

    if present_columns:
        # astype(str) keeps the concatenation working for non-text columns.
        pieces = [df[col].fillna('').astype(str) for col in present_columns]
        combined = pieces[0]
        for piece in pieces[1:]:
            combined = combined + ' ' + piece
        df['combined_text'] = combined.str.strip()
    else:
        df['combined_text'] = ''

    total = len(df)
    print(f"Total records: {total}")
    print("Starting tools extraction...\n")

    titles = df[title_column].tolist() if title_column in df.columns else None
    all_tools = []

    for position, combined_text in enumerate(df['combined_text']):
        print(f"Job {position + 1}/{total}:")

        if titles is None:
            print("Job Title column not found in dataset")
        else:
            job_title = titles[position]
            if not pd.isna(job_title) and job_title != "":
                print(f"Job Title: {job_title}")
            else:
                print("Job Title: [Not specified]")

        # combined_text is always a stripped string here, never NaN.
        if combined_text:
            print(f"Processing combined text:")
            tools = extract_tools_with_ollama(combined_text)
            if tools:
                print(f"Extracted tools: {tools}")
            else:
                print("No tools extracted")
                tools = []
        else:
            print("Combined text is empty for this job")
            tools = []
        all_tools.append(tools)

        print("-" * 50)

        # Brief pause between jobs so the local model server is not hammered.
        time.sleep(0.5)

    df['extracted_tools'] = all_tools
    df['extracted_tools_str'] = [', '.join(tools) for tools in all_tools]

    print("\n--- Sample Results ---")
    sample_columns = [col for col in (title_column, 'extracted_tools_str') if col in df.columns]
    print(df[sample_columns].head(10))
186
187
188
# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()