Electricity Prediction using K-Neighbors Regression
- saman aboutorab
- Jan 18, 2024
- 1 min read
This notebook is an introduction to the machine-learning concepts of clustering and prediction using regression. We will use the Building Data Genome Project data set to analyze electrical meter data from non-residential buildings.

We will train the model on a few simple time series features as well as outdoor air temperature to predict how much energy a building uses. For this demonstration, we will use three months of data from April, May, and June to predict July.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from scipy.cluster.vq import kmeans, vq, whiten
from scipy.spatial.distance import cdist
import numpy as np
from datetime import datetime

# --- Load meter data from a single building ---
# The 'timestamp' column becomes the (parsed) index so the readings can be
# sliced and grouped by time further down.
directory = 'meter_data/'
df = pd.read_csv(
    directory + 'Office_Amelie.csv',
    index_col="timestamp",
    parse_dates=True,
)

# Overview of everything in the file.
df.plot(alpha=0.5, figsize=(15, 5))
plt.title("Electricity Consumption")
plt.xlabel("Time Range")
plt.ylabel("kWh Electricity Consumption Visualization");

# Same plot restricted to a shorter window.
# NOTE(review): the bounds are ambiguous day/month strings -- pandas may read
# '01-02-2015' as 2 January rather than 1 February. Confirm the intended
# window and consider ISO dates (YYYY-MM-DD).
zoomed = df.truncate(before='01-02-2015', after='14-02-2015')
zoomed.plot(figsize=(15,5))
plt.title("Electricity Consumption")
plt.xlabel("Time Range")
plt.ylabel("kWh Electricity Consumption Visualization");

# --- Conventional Daily Profile Analysis - Weekday vs. Weekend ---
# Split every timestamp into its calendar date and its time of day, so that
# one day becomes one row (a daily load profile) in the pivot tables below.
df['Date'] = df.index.date
df['Time'] = df.index.time

# Rows: dates, columns: times of day, cells: mean meter reading.
df_pivot = pd.pivot_table(df, values='Office_Amelie', index='Date', columns='Time')

# Transposed so each date is drawn as one faint line; ticks every 10800 s (3 h).
df_pivot.T.plot(legend=False, figsize=(15,5), color='k', alpha=0.1, xticks=np.arange(0, 86400, 10800))
plt.title("Electrical Meter Data - Daily Profiles")
plt.xlabel("Daily Time Frame")
plt.ylabel("kWh Electricity");

# Monday is 0 and Sunday is 6 (datetime.date.weekday convention).
df['Weekday'] = df.index.weekday

# Weekdays: Monday (0) through Friday (4).
df_pivot_weekday = pd.pivot_table(df[df.Weekday < 5], values='Office_Amelie', index='Date', columns='Time')

df_pivot_weekday.T.plot(legend=False, figsize=(15,5), color='k', alpha=0.1, xticks=np.arange(0, 86400, 10800))
plt.title("Electrical Meter Data - Weekday Daily Profiles")
plt.xlabel("Daily Time Frame")
plt.ylabel("kWh Electricity");

# --- Manual identification of clusters ---
# Weekend: Saturday (5) and Sunday (6). The filter must be `>= 5`; `> 5`
# would keep Sundays only and silently drop every Saturday.
df_pivot_weekend = pd.pivot_table(df[df.Weekday >= 5], values='Office_Amelie', index='Date', columns='Time')
df_pivot_weekend.T.plot(legend=False, figsize=(15,5), color='k', alpha=0.1, xticks=np.arange(0, 86400, 10800))
plt.title("Electrical Meter Data - Weekend Daily Profiles")
plt.xlabel("Daily Time Frame")
plt.ylabel("kWh Electricity");

# --- k-Means Clustering of Daily Load Profiles ---
# Reload the raw file to discard the helper columns added above, then build a
# copy scaled as (value - mean) / (max - min) for clustering.
df = pd.read_csv(directory + 'Office_Amelie.csv', index_col="timestamp", parse_dates=True)

df_norm = (df - df.mean()) / (df.max() - df.min())
# Add time-of-day / date columns to both the raw and the normalised frame so
# each can be pivoted into one row per day.
df['Time'] = df.index.time
df['Date'] = df.index.date
df_norm['Time'] = df_norm.index.time
df_norm['Date'] = df_norm.index.date

dailyblocks = pd.pivot_table(df, values='Office_Amelie', index='Date', columns='Time', aggfunc='mean')
dailyblocks_norm = pd.pivot_table(df_norm, values='Office_Amelie', index='Date', columns='Time', aggfunc='mean')

# --- The Clustering Model ---
# Days with any missing reading are dropped. A plain ndarray is used instead
# of np.matrix, which NumPy no longer recommends and which newer SciPy
# releases refuse as input to the cluster.vq functions.
dailyblocksmatrix_norm = dailyblocks_norm.dropna().to_numpy()
# Four centroids; `iter` is forwarded to scipy's kmeans unchanged.
centers, _ = kmeans(dailyblocksmatrix_norm, 4, iter=10000)
# Assign every daily profile to its nearest centroid.
cluster, _ = vq(dailyblocksmatrix_norm, centers)

clusterdf = pd.DataFrame(cluster, columns=['ClusterNo'])

# Attach the cluster label to the un-normalised daily profiles (matched by
# row position after the same dropna()).
dailyclusters = pd.concat([dailyblocks.dropna().reset_index(), clusterdf], axis=1)

# Rank clusters by the daily total of their mean profile, ascending, so that
# the new label ClusterNo2 = 0 is the cluster with the lowest total.
# numeric_only=True keeps the object-typed 'Date' column out of the mean;
# pandas 2.x raises on it instead of dropping it silently.
x = dailyclusters.groupby('ClusterNo').mean(numeric_only=True).sum(axis=1).sort_values()
x = pd.DataFrame(x.reset_index())
x['ClusterNo2'] = x.index
x = x.set_index('ClusterNo')
x = x.drop([0], axis=1)
dailyclusters = dailyclusters.merge(x, how='outer', left_on='ClusterNo', right_index=True)

# Replace the arbitrary k-means label by the ranked one and reshape to
# rows = time of day, columns = (ClusterNo2, Date).
dailyclusters = dailyclusters.drop(['ClusterNo'], axis=1)
dailyclusters = dailyclusters.set_index(['ClusterNo2', 'Date']).T.sort_index()

clusterlist = list(dailyclusters.columns.get_level_values(0).unique())
# Overlay every daily profile on one axis, one colour per cluster.
matplotlib.rcParams['figure.figsize'] = (20, 7)
styles2 = [
    'LightSkyBlue', 'b', 'LightGreen', 'g', 'LightCoral', 'r',
    'SandyBrown', 'Orange', 'Plum', 'Purple', 'Gold', 'b',
]
fig, ax = plt.subplots()
# zip() stops at the shorter sequence, so only as many colours as there are
# clusters are consumed.
for cluster_id, colour in zip(clusterlist, styles2):
    cluster_days = dailyclusters[cluster_id]
    cluster_days.plot(
        ax=ax,
        legend=False,
        style=colour,
        alpha=0.1,
        xticks=np.arange(0, 86400, 10800),  # a tick every 10800 s (3 h)
    )
ax.set_ylabel('Total Daily Profile')
ax.set_xlabel('Time of Day');


# --- Aggregate visualizations of the clusters ---
def timestampcombine(date, time):
    """Return one datetime made from a calendar date and a time of day."""
    return datetime.combine(date, time)


def ClusterUnstacker(df):
    """Turn the wide cluster table into a long table with real timestamps.

    ``df`` is unstacked and its index reset, which must yield ``Date`` and
    ``level_2`` columns; their string forms are joined with a space and
    parsed into a new ``timestampstring`` datetime column. Rows containing
    any missing value are removed from the returned frame.
    """
    long_form = df.unstack().reset_index()
    stamp_text = long_form.Date.astype("str") + " " + long_form.level_2.astype("str")
    long_form['timestampstring'] = pd.to_datetime(stamp_text)
    return long_form.dropna()


# Notebook preview of the long-form layout (no effect outside a notebook).
dailyclusters.unstack().reset_index().head()

dfclusterunstacked = ClusterUnstacker(dailyclusters)
# One column per cluster, indexed by timestamp.
dfclusterunstackedpivoted = pd.pivot_table(
    dfclusterunstacked,
    values=0,
    index='timestampstring',
    columns='ClusterNo2',
)

# Daily totals per cluster; zero sums are turned into NaN so that no marker
# is drawn for them.
daily_totals = dfclusterunstackedpivoted.resample('D').sum().replace(0, np.nan)
clusteravgplot = daily_totals.plot(style="^", markersize=15)
clusteravgplot.set_ylabel('Daily Totals kWh')
clusteravgplot.set_xlabel('Date');

# Average profile of each cluster by time of day.
dfclusterunstackedpivoted['Time'] = dfclusterunstackedpivoted.index.time
mean_by_time = dfclusterunstackedpivoted.groupby('Time').mean()
dailyprofile = mean_by_time.plot(figsize=(20,7), linewidth=3, xticks=np.arange(0, 86400, 10800))
dailyprofile.set_ylabel('Average Daily Profile kWh')
dailyprofile.set_xlabel('Time of Day')
dailyprofile.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Cluster')


def DayvsClusterMaker(df):
    """Count, for each cluster, how many days fall on each day of the week.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form cluster table with a ``timestampstring`` datetime column,
        a ``ClusterNo2`` column and a value column named ``0``.
        NOTE: ``df`` is modified in place -- its index is replaced by
        ``timestampstring`` and ``Weekday``/``Date``/``Time`` columns are
        added.

    Returns
    -------
    pandas.DataFrame
        Rows ``Mon`` .. ``Sun``, one column per ``ClusterNo2`` value, cells
        holding the number of days. A weekday that never occurs in the data
        yields a row of NaN instead of an error.
    """
    df.index = df.timestampstring
    df['Weekday'] = df.index.map(lambda t: t.date().weekday())
    df['Date'] = df.index.map(lambda t: t.date())
    df['Time'] = df.index.map(lambda t: t.time())

    # Collapse to one row per calendar day. numeric_only=True is required
    # because the frame also carries object columns (Date, Time, ...) that
    # pandas 2.x refuses to average.
    DayVsCluster = df.resample('D').mean(numeric_only=True).reset_index(drop=True)
    DayVsCluster = pd.pivot_table(DayVsCluster, values=0, index='ClusterNo2', columns='Weekday', aggfunc='count')

    # The weekday labels came out of a mean and are floats; make them ints
    # and force all seven columns to exist before naming them, so a missing
    # weekday cannot shift or break the labels.
    DayVsCluster.columns = DayVsCluster.columns.astype(int)
    DayVsCluster = DayVsCluster.reindex(columns=range(7))
    DayVsCluster.columns = ['Mon','Tue','Wed','Thur','Fri','Sat','Sun']
    return DayVsCluster.T


DayVsCluster = DayvsClusterMaker(dfclusterunstacked)
# Stacked bars: one bar per day of the week, split by cluster.
DayVsClusterplot1 = DayVsCluster.plot.bar(figsize=(20,7), stacked=True)
DayVsClusterplot1.set_ylabel('Number of Days in Each Cluster')
DayVsClusterplot1.set_xlabel('Day of the Week')
DayVsClusterplot1.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Cluster')

# Same counts transposed: one bar per cluster, split by day of the week.
weekday_colours = ['b','g','r','c','m','y','k']
DayVsClusterplot2 = DayVsCluster.T.plot.bar(figsize=(20,7), stacked=True, color=weekday_colours)
DayVsClusterplot2.set_ylabel('Number of Days in Each Cluster')
DayVsClusterplot2.set_xlabel('Cluster Number')
DayVsClusterplot2.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# --- Electricity Prediction using Regression for Measurement and Verification ---
# The graphic below comes from the IPMVP to show how prediction can be used
# for M&V to calculate how much energy would have been consumed if an energy
# savings intervention had not been implemented.

# --- Load electricity data and weather data ---
df_prediction_data = pd.read_csv(
    directory + "UnivClass_Ciara.csv",
    parse_dates=True,
    index_col='timestamp',
)

df_prediction_data.plot()

directory_weather = 'weather_data/'
weather_data = pd.read_csv(directory_weather + "weather2.csv", index_col='timestamp', parse_dates=True)

# Hourly means. The lowercase "h" alias is used because the uppercase "H"
# alias is deprecated in recent pandas.
weather_hourly = weather_data.resample("h").mean()
# Mask every value that is not above -40 (becomes NaN) ...
weather_hourly_nooutlier = weather_hourly[weather_hourly > -40]
# ... then fill the gaps with the last valid observation. `.ffill()` replaces
# the deprecated `fillna(method='ffill')` and behaves the same.
weather_hourly_nooutlier_nogaps = weather_hourly_nooutlier.ffill()

temperature = weather_hourly_nooutlier_nogaps["TemperatureC"]

temperature.plot()

# --- Create Train and Test Datasets for Supervised Learning ---
training_months = [4,5,6]
test_months = [7]

# Select the rows of the meter series whose month is in each list.
in_training = df_prediction_data.index.month.isin(training_months)
in_test = df_prediction_data.index.month.isin(test_months)
trainingdata = df_prediction_data[in_training]
testdata = df_prediction_data[in_test]


def _calendar_weather_features(timestamps, months):
    """Build the feature table: hour dummies, weekday dummies, temperature.

    The three parts are joined side by side by row position and rows holding
    any NaN are dropped.
    NOTE(review): the temperature rows are matched to ``timestamps`` purely by
    position, which presumes both series cover the same hours -- confirm.
    """
    hour_flags = pd.get_dummies(timestamps.hour)
    weekday_flags = pd.get_dummies(timestamps.dayofweek)
    temps = pd.DataFrame(temperature[temperature.index.month.isin(months)].values)
    return pd.concat([hour_flags, weekday_flags, temps], axis=1).dropna()


train_features = _calendar_weather_features(trainingdata.index, training_months)

# --- Train a K-Neighbor Regressor Model ---
model = KNeighborsRegressor().fit(np.array(train_features), np.array(trainingdata.values));

test_features = np.array(_calendar_weather_features(testdata.index, test_months))

predictions = model.predict(test_features)

# Put the predictions next to the measured values on the test timestamps.
predicted_frame = pd.DataFrame(predictions, index=testdata.index)
predicted_vs_actual = pd.concat([testdata, predicted_frame], axis=1)

predicted_vs_actual.columns = ["Actual", "Predicted"]

predicted_vs_actual.plot()

# Prepend the training period so the plot shows both periods on one axis.
trainingdata.columns = ["Actual"]

predicted_vs_actual_plus_training = pd.concat([trainingdata, predicted_vs_actual], sort=True)

predicted_vs_actual_plus_training.plot()

# --- Regression evaluation metrics ---
# Calculate the absolute errors
# Absolute difference between prediction and measurement for each test row.
errors = (predicted_vs_actual['Predicted'] - predicted_vs_actual['Actual']).abs()
# Mean absolute percentage error (MAPE), expressed as a fraction.
MAPE = (errors / predicted_vs_actual['Actual']).mean()
# Value shown in the original notebook output: 0.06875718739673341
|
Reference:
https://cargocollective.com/buildingdata/DayFilter-Unsupervised-Pattern-Filtering
https://www.mdpi.com/2504-4990/1/3/56
Comments