1import pandas as pd
2import matplotlib.pyplot as plt
3import re
4from sklearn.feature_extraction.text import TfidfVectorizer
5from sklearn.model_selection import train_test_split
6from sklearn.linear_model import LogisticRegression
7from sklearn.pipeline import Pipeline
8from google.colab import files
9
10
# Ask for a file through the Colab upload helper; the result is kept in
# `uploaded`, but the CSV itself is read by its fixed name below.
uploaded = files.upload()
df = pd.read_csv("seek_jobs.csv")

# Build one lower-cased text field per posting from the descriptive columns,
# substituting an empty string for missing values so concatenation never
# yields NaN.
text_columns = ['job_type', 'job_title', 'job_details']
filled_columns = [df[column].fillna('') for column in text_columns]

combined = filled_columns[0]
for extra_column in filled_columns[1:]:
    combined = combined + ' ' + extra_column

df['combined_text'] = combined.str.lower()
20
21
def rule_based_classify(text):
    """Assign a job posting to an employment class using keyword rules.

    Matching is case-insensitive. Rules are checked in priority order and
    the first one that matches decides the result:

    1. A fixed-term marker ("fixed term", "fixed-term", or the standalone
       word "ftc") -> 'Fixed Term'. These markers take precedence over
       short-term markers.
    2. A short-term marker (day rates, "contract role", "short term")
       -> 'Short Term'.
    3. "full time" with no mention of "contract" -> 'Permanent'.
    4. "casual" -> 'Casual'.
    5. Any remaining mention of "contract" -> 'Short Term'.
    6. Otherwise -> 'Permanent'.

    Args:
        text: Free text describing the job. Non-string values (for example
            None or NaN) are treated as empty text.

    Returns:
        One of 'Short Term', 'Fixed Term', 'Permanent' or 'Casual'.
    """
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if not isinstance(text, str):
        text = ''
    text = text.lower()

    short_term_markers = (
        'day rate', '/ day', '/day', 'daily rate', 'contract role', 'short term',
    )

    # "ftc" is matched as a whole word so that words which merely contain
    # those letters (e.g. "giftcard") are not mistaken for the acronym.
    is_fixed_term = (
        'fixed term' in text
        or 'fixed-term' in text
        or re.search(r'\bftc\b', text) is not None
    )

    # Every fixed-term spelling outranks the short-term markers, not just
    # the literal "fixed term".
    if is_fixed_term:
        return 'Fixed Term'

    if any(marker in text for marker in short_term_markers):
        return 'Short Term'

    # Fixed-term, "short term" and "day rate" postings have already returned
    # above, so only a "contract" mention can still veto the permanent label.
    if 'full time' in text and 'contract' not in text:
        return 'Permanent'

    if 'casual' in text:
        return 'Casual'

    if 'contract' in text:
        return 'Short Term'

    return 'Permanent'
46
47
# Label every posting with the keyword rules; these labels become the
# training targets for the model below.
df['initial_class'] = df['combined_text'].apply(rule_based_classify)

print("\n🔍 Sample classification examples:")
sample_preview = df[['job_title', 'initial_class']].head(10)
print(sample_preview)

# Work on a copy restricted to rows that actually received a label.
df_ml = df.copy()
has_label = df_ml['initial_class'].notnull()
df_ml = df_ml[has_label]

# NOTE(review): X_test / y_test are produced here but never scored in the
# visible part of the script.
X_train, X_test, y_train, y_test = train_test_split(
    df_ml['combined_text'],
    df_ml['initial_class'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
classifier = LogisticRegression(max_iter=1000)
model = Pipeline([
    ('tfidf', vectorizer),
    ('clf', classifier),
])
model.fit(X_train, y_train)

# Predict a class for every posting, including those used for training.
df['job_classification'] = model.predict(df['combined_text'])

# One line per posting: left-aligned class label, then the job title.
for title, classification in zip(df['job_title'], df['job_classification']):
    print(f"{classification:<15} | {title}")