# Feature engineering: derive calendar parts from the raw 'Month' column,
# then count crimes per LSOA per calendar month.
month_ts = pd.to_datetime(df['Month'])
df['Month'] = month_ts
df['Year'] = month_ts.dt.year
df['MonthNum'] = month_ts.dt.month

# One row per (LSOA, year, month) with the number of recorded crimes.
group_keys = ['LSOA name', 'Year', 'MonthNum']
monthly_hotspots = (
    df.groupby(group_keys)
      .size()
      .reset_index(name='CrimeCount')
)
# Binary hotspot label: an LSOA-month is a "hotspot" when its crime count
# reaches the upper quartile of all LSOA-month counts.
# NOTE(review): the quantile is taken over the full dataset — if a train/test
# split happens downstream, this leaks test information into the label
# threshold; confirm against the split location.
counts = monthly_hotspots['CrimeCount']
hotspot_threshold = counts.quantile(0.75)
monthly_hotspots['Hotspot'] = (counts >= hotspot_threshold).astype(int)

print('Hotspot threshold:', hotspot_threshold)
# Class balance check: share of hotspot vs non-hotspot rows.
print(monthly_hotspots['Hotspot'].value_counts(normalize=True))
# Model matrix: categorical LSOA identifier plus two numeric calendar features.
X = model_df[['LSOA name', 'Year', 'MonthNum']]
y = model_df['Hotspot']

# Categorical branch: fill gaps with the modal value, then one-hot encode;
# handle_unknown='ignore' keeps inference safe for LSOAs unseen in training.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: median imputation only (tree models need no scaling).
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

preprocessor = ColumnTransformer([
    ('cat', categorical_branch, ['LSOA name']),
    ('num', numeric_branch, ['Year', 'MonthNum']),
])
# Baseline model: class-weighted random forest with a fixed seed for
# reproducibility.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    class_weight='balanced',  # counteract the ~3:1 non-hotspot majority
    random_state=42,
    n_jobs=1,
)

# Preprocessing and model chained so the encoder is fit on training data only.
steps = [
    ('prep', preprocessor),
    ('model', rf),
]
pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# Hyperparameter search over the forest inside the pipeline; the `model__`
# prefix targets the 'model' step of `pipeline`.
param_grid = {
    'model__n_estimators': [200, 400],
    'model__max_depth': [12, None],
    'model__min_samples_leaf': [1, 2],
    'model__class_weight': [
        'balanced',
        'balanced_subsample'
    ]
}
# BUGFIX: the original passed `rf_pipeline`, which is never defined — the
# fitted pipeline built above is named `pipeline`, and its 'model' step name
# matches the `model__*` keys in param_grid.
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=3, scoring='f1', n_jobs=1  # F1 handles the imbalanced hotspot label
)
grid_search.fit(X_train, y_train)