AusBiz Consulting

Python

import json
import time

import pandas as pd
import requests
5
# Generic words the model tends to return even though they are not tools.
_NON_TOOL_TERMS = frozenset({
    "analytical", "technical", "digital", "innovative", "advanced",
    "solution", "problem-solving", "communication", "teamwork",
    "methodology", "approach", "strategy", "skill",
})

# Long-form name the model may emit -> (short form to look for in the job
# text, name to report).  ``None`` means "report the model's own spelling".
_TOOL_ALIASES = {
    "javascript": ("js", None),
    "microsoft excel": ("excel", "Excel"),
    "amazon web services": ("aws", "AWS"),
}


def _build_prompt(job_text):
    """Return the extraction prompt with *job_text* embedded."""
    return f"""
    You are a specialized tool extractor for job descriptions. Your task is to identify ONLY specific named technical tools, technologies, software, programming languages, platforms, and frameworks.

    STRICT RULES:
    1. Extract ONLY proper noun names of specific technologies
    2. Include ONLY: software names, programming languages, frameworks, platforms, databases, cloud services
    3. Do NOT include: general skills, methodologies, concepts, or descriptive terms
    4. Do NOT include adjectives or general terms like "analytical", "digital", "innovative"
    5. IMPORTANT: ONLY extract tools that are EXPLICITLY mentioned in the text
    6. If no specific tools are mentioned, return an empty array []

    CORRECT EXAMPLES:
    - INCLUDE: Python, Java, AWS, Azure, Excel, SQL, PowerBI, Tableau, Git, Docker, React, TensorFlow
    - DO NOT INCLUDE: analytical, technical, problem-solving, digital transformation, innovative

    Job Text:
    {job_text}

    Respond ONLY with a JSON array containing the list of specific technical tools found.
    Example response format: ["Python", "AWS", "Docker"]
    """


def _first_json_array(text):
    """Return the first parseable JSON array embedded in *text*, else None."""
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            # raw_decode tolerates trailing chatter after the array, which
            # json.loads on a first-'['-to-last-']' slice does not.
            parsed, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("[", start + 1)
        else:
            return parsed
    return None


def _select_verified_tools(candidates, job_text_lower):
    """Keep only plausible tool names that actually occur in the job text."""
    verified = []
    seen = set()
    for candidate in candidates:
        # The model may emit numbers, nulls or nested arrays; skip them
        # instead of failing on ``.lower()``.
        if not isinstance(candidate, str):
            continue
        name = candidate.strip()
        key = name.lower()
        # NOTE: the "ing" rule drops gerunds such as "programming", but it
        # also drops any real product whose name ends in "ing" (e.g. "Spring").
        if len(name) <= 1 or key in _NON_TOOL_TERMS or key.endswith("ing"):
            continue

        if key in job_text_lower:
            reported = name
        else:
            short_form, canonical = _TOOL_ALIASES.get(key, (None, None))
            if short_form is None or short_form not in job_text_lower:
                continue
            reported = canonical or name

        # Report each tool once, keeping first-seen order.
        dedupe_key = reported.lower()
        if dedupe_key not in seen:
            seen.add(dedupe_key)
            verified.append(reported)
    return verified


def extract_tools_with_ollama(
    job_text,
    *,
    model="gemma3:1b",
    api_url="http://localhost:11434/api/generate",
    timeout=30,
):
    """Extract technical tool names from job text via a local Ollama model.

    The model's answer is treated as untrusted: it is parsed defensively,
    filtered against a stop list, and every surviving name must occur in
    ``job_text`` (case-insensitive substring match, plus a few known
    long-form/short-form aliases) before it is returned.

    Args:
        job_text: Job posting text. Anything that is not a non-blank string
            (``None``, NaN, numbers, ``""``) yields ``[]`` without a request.
        model: Model name sent in the request body.
        api_url: URL of the Ollama generate endpoint.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Returns:
        A list of unique tool names in first-seen order; ``[]`` when nothing
        is found or when the request/response cannot be used. Failures are
        reported with ``print`` rather than raised.
    """
    if not isinstance(job_text, str) or not job_text.strip():
        return []

    try:
        response = requests.post(
            api_url,
            json={
                "model": model,
                "prompt": _build_prompt(job_text),
                "stream": False,
            },
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"Exception when calling Ollama API: {str(e)}")
        return []

    if response.status_code != 200:
        print(f"Error from Ollama API: {response.status_code} - {response.text}")
        return []

    try:
        result = response.json()["response"]
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or not an object carrying a "response" field.
        print(f"Unexpected payload from Ollama API: {e!r}")
        return []

    if not isinstance(result, str) or "[" not in result:
        print(f"Couldn't find JSON array in: {result}")
        return []

    tools = _first_json_array(result)
    if tools is None:
        print(f"Failed to parse JSON from: {result}")
        return []

    verified_tools = _select_verified_tools(tools, job_text.lower())
    print(f"Initial extraction: {tools}")
    print(f"After verification: {verified_tools}")
    return verified_tools
105
def batch(lst, size):
    """Yield successive slices of *lst* holding at most *size* items each.

    Args:
        lst: A sliceable sequence that supports ``len()`` (list, tuple, str).
        size: Maximum number of items per batch; must be at least 1.

    Yields:
        Consecutive slices of ``lst`` in order; only the last one may be
        shorter than ``size``. An empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``size`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if size < 1:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently produces no batches at all.
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(lst), size):
        yield lst[start:start + size]
110
def main(file_path="seek_jobs.csv", output_path=None, delay=0.5):
    """Extract tools for every job in a CSV file and print a sample.

    Reads the dataset, concatenates the job type, title, details and
    description columns into one text per row, sends each non-empty text to
    ``extract_tools_with_ollama`` and stores the results in two new columns:
    ``extracted_tools`` (list) and ``extracted_tools_str`` (comma-separated).

    Args:
        file_path: CSV file to read.
        output_path: When given, the augmented table is written there as CSV.
        delay: Seconds to sleep after each job.

    Returns:
        The DataFrame with the added columns.

    Raises:
        KeyError: If any of the required text columns is missing.
    """
    print("✅ Script started")

    print(f"Reading dataset from {file_path}...")
    df = pd.read_csv(file_path)

    print(f"📄 Data loaded: {len(df)} rows")

    title_column = "job_title"
    description_column = "description"
    job_details_column = "job_details"
    job_type_column = "job_type"

    # Listed in the order the pieces appear in the combined text.
    text_columns = [
        job_type_column,
        title_column,
        job_details_column,
        description_column,
    ]
    missing = [column for column in text_columns if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {file_path}: {missing}")

    # astype(str) keeps the join working when a column was parsed as numeric.
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    df['combined_text'] = parts[0].str.cat(parts[1:], sep=' ').str.strip()

    total = len(df)
    print(f"Total records: {total}")
    print("Starting tools extraction...\n")

    all_tools = []

    # Iterate values positionally so the loop does not depend on the index
    # labels being 0..n-1.
    rows = zip(df[title_column], df['combined_text'])
    for position, (job_title, combined_text) in enumerate(rows, start=1):
        print(f"Job {position}/{total}:")

        if not pd.isna(job_title) and job_title != "":
            print(f"Job Title: {job_title}")
        else:
            print("Job Title: [Not specified]")

        if combined_text:
            print("Processing combined text:")
            tools = extract_tools_with_ollama(combined_text)
            if tools:
                print(f"Extracted tools: {tools}")
            else:
                print("No tools extracted")
        else:
            print("Combined text is empty for this job")
            tools = []
        all_tools.append(tools)

        print("-" * 50)

        # Pause between jobs so the local model server is not hammered.
        time.sleep(delay)

    df['extracted_tools'] = all_tools
    df['extracted_tools_str'] = [', '.join(tools) for tools in all_tools]

    if output_path is not None:
        df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

    # Display results
    print("\n--- Sample Results ---")
    print(df[[title_column, 'extracted_tools_str']].head(10))

    return df
186
187
188
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

1import json 2import pandas as pd 3import requests 4import time 5 6def extract_tools_with_ollama(job_text): 7 """ 8 Extract technical tools from job text using Ollama with Gemma model 9 """ 10 11 if pd.isna(job_text) or job_text == "": 12 return [] 13 14 15 job_text_lower = job_text.lower() 16 17 18 prompt = f""" 19 You are a specialized tool extractor for job descriptions. Your task is to identify ONLY specific named technical tools, technologies, software, programming languages, platforms, and frameworks. 20 21 STRICT RULES: 22 1. Extract ONLY proper noun names of specific technologies 23 2. Include ONLY: software names, programming languages, frameworks, platforms, databases, cloud services 24 3. Do NOT include: general skills, methodologies, concepts, or descriptive terms 25 4. Do NOT include adjectives or general terms like "analytical", "digital", "innovative" 26 5. IMPORTANT: ONLY extract tools that are EXPLICITLY mentioned in the text 27 6. If no specific tools are mentioned, return an empty array [] 28 29 CORRECT EXAMPLES: 30 - INCLUDE: Python, Java, AWS, Azure, Excel, SQL, PowerBI, Tableau, Git, Docker, React, TensorFlow 31 - DO NOT INCLUDE: analytical, technical, problem-solving, digital transformation, innovative 32 33 Job Text: 34 {job_text} 35 36 Respond ONLY with a JSON array containing the list of specific technical tools found. 
37 Example response format: ["Python", "AWS", "Docker"] 38 """ 39 40 # Make request to local Ollama API 41 try: 42 response = requests.post( 43 "http://localhost:11434/api/generate", 44 json={ 45 "model": "gemma3:1b", 46 "prompt": prompt, 47 "stream": False 48 }, 49 timeout=30 50 ) 51 52 if response.status_code == 200: 53 result = response.json()["response"] 54 55 # Extract JSON array from the response 56 try: 57 # Try to find JSON array in the response 58 start_idx = result.find('[') 59 end_idx = result.rfind(']') + 1 60 61 if start_idx >= 0 and end_idx > start_idx: 62 json_str = result[start_idx:end_idx] 63 tools = json.loads(json_str) 64 65 # Additional filtering to remove non-tool items 66 non_tools = ["analytical", "technical", "digital", "innovative", "advanced", 67 "solution", "problem-solving", "communication", "teamwork", 68 "methodology", "approach", "strategy", "skill"] 69 70 # Filter out any items that are likely not tools 71 filtered_tools = [tool for tool in tools 72 if tool.lower() not in [nt.lower() for nt in non_tools] 73 and len(tool) > 1 # Avoid single characters 74 and not tool.lower().endswith("ing")] # Avoid gerunds like "programming" 75 76 # CRITICAL: Verify each tool actually appears in the job text 77 verified_tools = [] 78 for tool in filtered_tools: 79 # Check if the tool name actually appears in the job text 80 if tool.lower() in job_text_lower: 81 verified_tools.append(tool) 82 # Check for common variations/abbreviations 83 elif tool.lower() == "javascript" and "js" in job_text_lower: 84 verified_tools.append(tool) 85 elif tool.lower() == "microsoft excel" and "excel" in job_text_lower: 86 verified_tools.append("Excel") 87 elif tool.lower() == "amazon web services" and "aws" in job_text_lower: 88 verified_tools.append("AWS") 89 90 print(f"Initial extraction: {tools}") 91 print(f"After verification: {verified_tools}") 92 return verified_tools 93 else: 94 print(f"Couldn't find JSON array in: {result}") 95 return [] 96 except 
json.JSONDecodeError: 97 print(f"Failed to parse JSON from: {result}") 98 return [] 99 else: 100 print(f"Error from Ollama API: {response.status_code} - {response.text}") 101 return [] 102 except Exception as e: 103 print(f"Exception when calling Ollama API: {str(e)}") 104 return [] 105 106def batch(lst, size): 107 """Split a list into batches of specified size""" 108 for i in range(0, len(lst), size): 109 yield lst[i:i + size] 110 111def main(): 112 print("✅ Script started") 113 114 # Fix Windows file path if needed 115 file_path = "seek_jobs.csv" 116 117 print(f"Reading dataset from {file_path}...") 118 df = pd.read_csv(file_path) 119 120 print(f"📄 Data loaded: {len(df)} rows") 121 122 # Columns to extract tools from 123 title_column = "job_title" # Column containing job titles 124 description_column = "description" 125 job_details_column = "job_details" 126 job_type_column = "job_type" 127 128 # Combine relevant columns for more context 129 df['combined_text'] = ( 130 df[job_type_column].fillna('') + ' ' + 131 df[title_column].fillna('') + ' ' + 132 df[job_details_column].fillna('') + ' ' + 133 df[description_column].fillna('') 134 ).str.strip() 135 136 print(f"Total records: {len(df)}") 137 print("Starting tools extraction...\n") 138 139 all_tools = [] 140 141 for idx in range(len(df)): 142 print(f"Job {idx+1}/{len(df)}:") 143 144 # Print job title first 145 if title_column in df.columns: 146 job_title = df.loc[idx, title_column] 147 if not pd.isna(job_title) and job_title != "": 148 print(f"Job Title: {job_title}") 149 else: 150 print("Job Title: [Not specified]") 151 else: 152 print("Job Title column not found in dataset") 153 154 # Get combined text for this job 155 combined_text = df.loc[idx, 'combined_text'] 156 157 if not pd.isna(combined_text) and combined_text != "": 158 print(f"Processing combined text:") 159 tools = extract_tools_with_ollama(combined_text) 160 161 if tools: 162 print(f"Extracted tools: {tools}") 163 all_tools.append(tools) 164 else: 
165 print("No tools extracted") 166 all_tools.append([]) 167 else: 168 print("Combined text is empty for this job") 169 all_tools.append([]) 170 171 172 print("-" * 50) 173 174 175 time.sleep(0.5) 176 177 178 df['extracted_tools'] = all_tools 179 180 181 df['extracted_tools_str'] = df['extracted_tools'].apply(lambda x: ', '.join(x) if x else '') 182 183 # Display results 184 print("\n--- Sample Results ---") 185 print(df[[title_column, 'extracted_tools_str']].head(10)) 186 187 188 189if __name__ == "__main__": 190 main()

Tool extraction from dataset (Source code)

Next Up

Lesson 26: Soft Skills extraction from dataset (Source code)

Next Up

Lesson 26: Soft Skills extraction from dataset (Source code)