AusBiz Consulting

Python

1import pandas as pd
2import matplotlib.pyplot as plt
3import re
4from sklearn.feature_extraction.text import TfidfVectorizer
5from sklearn.model_selection import train_test_split
6from sklearn.linear_model import LogisticRegression
7from sklearn.pipeline import Pipeline
8from google.colab import files
9
10
import io

# Open Colab's file picker; the result maps each uploaded file name to its
# raw bytes.
uploaded = files.upload()

# Parse the bytes that were just uploaded instead of re-reading a fixed path
# from disk: Colab may store a re-uploaded file under a de-duplicated name
# (e.g. "seek_jobs (1).csv"), in which case reading "seek_jobs.csv" would
# silently load an older copy.
if uploaded:
    csv_name = "seek_jobs.csv" if "seek_jobs.csv" in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
else:
    # Nothing was uploaded (e.g. the picker was cancelled): fall back to a
    # copy already present in the working directory.
    df = pd.read_csv("seek_jobs.csv")


# One lower-cased text field per posting: job type, title and details joined
# with spaces. Missing values become empty strings so the concatenation never
# yields NaN.
df['combined_text'] = (
    df['job_type'].fillna('') + ' ' +
    df['job_title'].fillna('') + ' ' +
    df['job_details'].fillna('')
).str.lower()
20
21
# Phrases that mark a day-rate / short-term contract engagement.
_SHORT_TERM_TERMS = (
    'day rate', '/ day', '/day', 'daily rate', 'contract role', 'short term',
)

# "fixed term" / "fixed-term" as phrases; "ftc" only as a standalone token so
# that words which merely contain those letters (e.g. "giftcards") do not
# count as a fixed-term marker.
_FIXED_TERM_RE = re.compile(r'fixed[ -]term|\bftc\b')


def rule_based_classify(text):
    """Classify a job posting's text with keyword rules.

    Rules are applied in priority order; the first match wins:

    1. Any fixed-term marker ("fixed term", "fixed-term", "ftc")
       -> 'Fixed Term'.
    2. Any day-rate / short-term phrase -> 'Short Term'.
    3. "full time" with no mention of "contract" -> 'Permanent'.
    4. "casual" -> 'Casual'.
    5. "contract" -> 'Short Term'.
    6. Otherwise -> 'Permanent'.

    Args:
        text: Free text describing the posting. Matching is
            case-insensitive. Non-string values (None, NaN) are treated as
            empty text.

    Returns:
        One of 'Short Term', 'Fixed Term', 'Permanent' or 'Casual'.
    """
    # Missing values from a DataFrame arrive as None/NaN; treat them as empty
    # text instead of failing on .lower().
    if not isinstance(text, str):
        text = ''
    text = text.lower()

    # Fixed-term markers take precedence over day-rate/short-term phrases, and
    # every spelling of the marker is honoured (previously only the literal
    # "fixed term" blocked the short-term rule).
    if _FIXED_TERM_RE.search(text):
        return 'Fixed Term'

    if any(term in text for term in _SHORT_TERM_TERMS):
        return 'Short Term'

    # Fixed-term, short-term and day-rate wording has already returned above,
    # so "contract" is the only remaining signal that can override "full time".
    mentions_contract = 'contract' in text

    if 'full time' in text and not mentions_contract:
        return 'Permanent'

    if 'casual' in text:
        return 'Casual'

    if mentions_contract:
        return 'Short Term'

    return 'Permanent'
46
47
# Label every posting with the keyword rules; these labels are the training
# target for the model below.
df['initial_class'] = df['combined_text'].apply(rule_based_classify)


print("\n🔍 Sample classification examples:")
print(df[['job_title', 'initial_class']].head(10))


# rule_based_classify always returns a label, so this filter only guards
# against missing values appearing in 'initial_class' by other means.
df_ml = df.copy()
df_ml = df_ml[df_ml['initial_class'].notnull()]


# Hold out 20% of the rows; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_ml['combined_text'], df_ml['initial_class'], test_size=0.2, random_state=42
)


# TF-IDF over unigrams and bigrams (capped at 5000 features) feeding a
# logistic-regression classifier.
model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

if y_train.nunique() < 2:
    # LogisticRegression cannot be fitted when the training labels contain a
    # single class; keep the rule-based labels in that case.
    df['job_classification'] = df['initial_class']
else:
    model.fit(X_train, y_train)

    # Use the held-out split that was previously created but never consulted.
    # The targets are the rule-based labels, so this measures agreement with
    # the rules, not accuracy against human-verified classes.
    print(f"\nHeld-out agreement with rule-based labels: {model.score(X_test, y_test):.3f}")

    df['job_classification'] = model.predict(df['combined_text'])


# One line per posting: classification padded to 15 characters, then title.
for title, classification in zip(df['job_title'], df['job_classification']):
    print(f"{classification:<15} | {title}")

Python

1import pandas as pd
2import matplotlib.pyplot as plt
3import re
4from sklearn.feature_extraction.text import TfidfVectorizer
5from sklearn.model_selection import train_test_split
6from sklearn.linear_model import LogisticRegression
7from sklearn.pipeline import Pipeline
8from google.colab import files
9
10
import io

# Open Colab's file picker; the result maps each uploaded file name to its
# raw bytes.
uploaded = files.upload()

# Parse the bytes that were just uploaded instead of re-reading a fixed path
# from disk: Colab may store a re-uploaded file under a de-duplicated name
# (e.g. "seek_jobs (1).csv"), in which case reading "seek_jobs.csv" would
# silently load an older copy.
if uploaded:
    csv_name = "seek_jobs.csv" if "seek_jobs.csv" in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
else:
    # Nothing was uploaded (e.g. the picker was cancelled): fall back to a
    # copy already present in the working directory.
    df = pd.read_csv("seek_jobs.csv")


# One lower-cased text field per posting: job type, title and details joined
# with spaces. Missing values become empty strings so the concatenation never
# yields NaN.
df['combined_text'] = (
    df['job_type'].fillna('') + ' ' +
    df['job_title'].fillna('') + ' ' +
    df['job_details'].fillna('')
).str.lower()
20
21
# Phrases that mark a day-rate / short-term contract engagement.
_SHORT_TERM_TERMS = (
    'day rate', '/ day', '/day', 'daily rate', 'contract role', 'short term',
)

# "fixed term" / "fixed-term" as phrases; "ftc" only as a standalone token so
# that words which merely contain those letters (e.g. "giftcards") do not
# count as a fixed-term marker.
_FIXED_TERM_RE = re.compile(r'fixed[ -]term|\bftc\b')


def rule_based_classify(text):
    """Classify a job posting's text with keyword rules.

    Rules are applied in priority order; the first match wins:

    1. Any fixed-term marker ("fixed term", "fixed-term", "ftc")
       -> 'Fixed Term'.
    2. Any day-rate / short-term phrase -> 'Short Term'.
    3. "full time" with no mention of "contract" -> 'Permanent'.
    4. "casual" -> 'Casual'.
    5. "contract" -> 'Short Term'.
    6. Otherwise -> 'Permanent'.

    Args:
        text: Free text describing the posting. Matching is
            case-insensitive. Non-string values (None, NaN) are treated as
            empty text.

    Returns:
        One of 'Short Term', 'Fixed Term', 'Permanent' or 'Casual'.
    """
    # Missing values from a DataFrame arrive as None/NaN; treat them as empty
    # text instead of failing on .lower().
    if not isinstance(text, str):
        text = ''
    text = text.lower()

    # Fixed-term markers take precedence over day-rate/short-term phrases, and
    # every spelling of the marker is honoured (previously only the literal
    # "fixed term" blocked the short-term rule).
    if _FIXED_TERM_RE.search(text):
        return 'Fixed Term'

    if any(term in text for term in _SHORT_TERM_TERMS):
        return 'Short Term'

    # Fixed-term, short-term and day-rate wording has already returned above,
    # so "contract" is the only remaining signal that can override "full time".
    mentions_contract = 'contract' in text

    if 'full time' in text and not mentions_contract:
        return 'Permanent'

    if 'casual' in text:
        return 'Casual'

    if mentions_contract:
        return 'Short Term'

    return 'Permanent'
46
47
# Label every posting with the keyword rules; these labels are the training
# target for the model below.
df['initial_class'] = df['combined_text'].apply(rule_based_classify)


print("\n🔍 Sample classification examples:")
print(df[['job_title', 'initial_class']].head(10))


# rule_based_classify always returns a label, so this filter only guards
# against missing values appearing in 'initial_class' by other means.
df_ml = df.copy()
df_ml = df_ml[df_ml['initial_class'].notnull()]


# Hold out 20% of the rows; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_ml['combined_text'], df_ml['initial_class'], test_size=0.2, random_state=42
)


# TF-IDF over unigrams and bigrams (capped at 5000 features) feeding a
# logistic-regression classifier.
model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

if y_train.nunique() < 2:
    # LogisticRegression cannot be fitted when the training labels contain a
    # single class; keep the rule-based labels in that case.
    df['job_classification'] = df['initial_class']
else:
    model.fit(X_train, y_train)

    # Use the held-out split that was previously created but never consulted.
    # The targets are the rule-based labels, so this measures agreement with
    # the rules, not accuracy against human-verified classes.
    print(f"\nHeld-out agreement with rule-based labels: {model.score(X_test, y_test):.3f}")

    df['job_classification'] = model.predict(df['combined_text'])


# One line per posting: classification padded to 15 characters, then title.
for title, classification in zip(df['job_title'], df['job_classification']):
    print(f"{classification:<15} | {title}")

Job Classification {Permanent, Short Term, Fixed Term, Casual} (Source code)

Next Up

Lesson 25: Tool extraction from dataset (Source code)

Next Up

Lesson 25: Tool extraction from dataset (Source code)