import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# Read the weather-station table from the CSV file in the working directory.
csv_path = 'weather_stations.csv'
df = pd.read_csv(csv_path)
# Preview of the leading rows; the returned frame is discarded, so this has no
# visible effect unless it is the last expression of an interactive cell.
df.head()
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)
# Print the column summary of the cleaned frame.
df.info()
# Columns fed to the clustering step: three temperature readings, wind speed
# and precipitation.
feature_columns = [
    'Data.Temperature.Avg Temp',
    'Data.Temperature.Max Temp',
    'Data.Temperature.Min Temp',
    'Data.Wind.Speed',
    'Data.Precipitation',
]
# Keep only rows in which every clustering input is present. The scaler and
# DBSCAN cannot handle NaN, so this step must not rely on cleaning done
# elsewhere; it also makes the `features.index` re-alignment below meaningful.
features = df[feature_columns].dropna()

# Standardise each feature so that no single column dominates the distance
# computation used by DBSCAN.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply DBSCAN
dbscan = DBSCAN(eps=1.3, min_samples=5)  # You can tweak eps/min_samples
# One label per row of `features`; the label -1 is treated as "outlier" by the
# plotting code further down.
clusters = dbscan.fit_predict(scaled_features)

# Add cluster labels to the original dataframe. Restrict it to the rows that
# were actually clustered and take an explicit copy so the new column is
# written to an independent frame rather than a selection of the old one.
df = df.loc[features.index].copy()
df['Cluster'] = clusters
# Cluster overview: average vs. maximum temperature (two of the clustering
# features), coloured by DBSCAN label.
plt.figure(figsize=(10, 6))
# Rows labelled -1 get their own marker style.
noise_mask = df['Cluster'] == -1
cluster_ax = sns.scatterplot(
    data=df,
    x='Data.Temperature.Avg Temp',
    y='Data.Temperature.Max Temp',
    hue='Cluster',
    style=noise_mask,
    palette='tab10',
    s=100,
)
cluster_ax.set_title('DBSCAN Clustering of Weather Stations')
cluster_ax.set_xlabel('Average Temperature')
cluster_ax.set_ylabel('Max Temperature')
cluster_ax.legend(title='Cluster')
plt.show()
# Second figure: only the rows whose cluster label is -1.
outliers = df.loc[df['Cluster'] == -1]
_, outlier_ax = plt.subplots(figsize=(10, 6))
outlier_ax.scatter(
    outliers['Data.Temperature.Avg Temp'],
    outliers['Data.Temperature.Max Temp'],
    s=100,
    c='red',
    label='Outliers',
)
outlier_ax.set_xlabel('Average Temperature')
outlier_ax.set_ylabel('Max Temperature')
outlier_ax.set_title('Outlier Weather Stations Detected by DBSCAN')
outlier_ax.legend()
plt.show()
# Comments
# Post a Comment