# Feature engineering: derive calendar parts from the raw 'Month' column,
# then count crimes per LSOA per calendar month.
month_ts = pd.to_datetime(df['Month'])
df['Month'] = month_ts
df['Year'] = month_ts.dt.year
df['MonthNum'] = month_ts.dt.month

# One row per (LSOA, year, month) with the number of recorded crimes.
group_keys = ['LSOA name', 'Year', 'MonthNum']
monthly_hotspots = (
    df.groupby(group_keys)
      .size()
      .reset_index(name='CrimeCount')
)
# Binary hotspot label: an LSOA-month is a "hotspot" when its crime count
# reaches the upper quartile of all LSOA-month counts.
# NOTE(review): the quantile is taken over the full dataset — if a train/test
# split happens downstream, this leaks test information into the label
# threshold; confirm against the split location.
counts = monthly_hotspots['CrimeCount']
hotspot_threshold = counts.quantile(0.75)
monthly_hotspots['Hotspot'] = (counts >= hotspot_threshold).astype(int)

print('Hotspot threshold:', hotspot_threshold)
# Class balance check: share of hotspot vs non-hotspot rows.
print(monthly_hotspots['Hotspot'].value_counts(normalize=True))
# Model matrix: categorical LSOA identifier plus two numeric calendar features.
X = model_df[['LSOA name', 'Year', 'MonthNum']]
y = model_df['Hotspot']

# Categorical branch: fill gaps with the modal value, then one-hot encode;
# handle_unknown='ignore' keeps inference safe for LSOAs unseen in training.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: median imputation only (tree models need no scaling).
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

preprocessor = ColumnTransformer([
    ('cat', categorical_branch, ['LSOA name']),
    ('num', numeric_branch, ['Year', 'MonthNum']),
])
# Baseline model: class-weighted random forest with a fixed seed for
# reproducibility.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    class_weight='balanced',  # counteract the ~3:1 non-hotspot majority
    random_state=42,
    n_jobs=1,
)

# Preprocessing and model chained so the encoder is fit on training data only.
steps = [
    ('prep', preprocessor),
    ('model', rf),
]
pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# Hyperparameter search over the forest inside the pipeline; the `model__`
# prefix targets the 'model' step of `pipeline`.
param_grid = {
    'model__n_estimators': [200, 400],
    'model__max_depth': [12, None],
    'model__min_samples_leaf': [1, 2],
    'model__class_weight': [
        'balanced',
        'balanced_subsample'
    ]
}
# BUGFIX: the original passed `rf_pipeline`, which is never defined — the
# fitted pipeline built above is named `pipeline`, and its 'model' step name
# matches the `model__*` keys in param_grid.
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=3, scoring='f1', n_jobs=1  # F1 handles the imbalanced hotspot label
)
grid_search.fit(X_train, y_train)