# Suppress warnings to keep the notebook output clean and moderate error messages
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Loading historical price data
# The Date column had to be cleaned because the entries were formatted incorrectly in Microsoft Excel
df = pd.read_csv(r'C:\Users\PC\Desktop\KIbe\sem 2\Unstructured data analytics & apps\jupkibe\data\cpi.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
data = df['Kenya CPI'].values.reshape(-1, 1)
#Data exploration
df.head()
# results with weighted averages
| Date | Lower Income | Middle Income | Upper Income | Nairobi Combined | CPI of Urban Areas | Kenya CPI |
|---|---|---|---|---|---|---|
| 2018-01-01 | 191.45 | 159.79 | 156.07 | 182.48 | 187.57 | 185.47 |
| 2018-02-01 | 194.37 | 160.71 | 157.31 | 184.85 | 190.20 | 188.00 |
| 2018-03-01 | 198.16 | 161.43 | 157.56 | 187.76 | 192.62 | 190.62 |
| 2018-04-01 | 200.32 | 161.79 | 157.57 | 189.41 | 195.82 | 193.18 |
| 2018-05-01 | 202.10 | 163.00 | 157.92 | 191.00 | 197.89 | 195.05 |
#Data exploration
# Visualize the data
#plt.figure(figsize=(12, 6))
plt.plot(df)
plt.title('Historical Price Index')
plt.xlabel('Year')
plt.ylabel('Kenya Consumer Price Index')
plt.show()
correlation=df.corr()
correlation['Kenya CPI'].sort_values(ascending=True)
Upper Income          0.671732
Lower Income          0.780329
Middle Income         0.811479
Nairobi Combined      0.917577
CPI of Urban Areas    0.974351
Kenya CPI             1.000000
Name: Kenya CPI, dtype: float64
#Data exploration
# Extract relevant columns of interest
# Kenya CPI is the series we model
df = df[['Kenya CPI']]
# The Date column is already the DataFrame index, so it does not need to be selected
#df = df[['Date']]
df.head()
| Date | Kenya CPI |
|---|---|
| 2018-01-01 | 185.47 |
| 2018-02-01 | 188.00 |
| 2018-03-01 | 190.62 |
| 2018-04-01 | 193.18 |
| 2018-05-01 | 195.05 |
# Look for missing values in our selected column of interest
df.isna().sum()
# On the first run this found NaN values, but after cleaning the data in Excel it reports zero NaN values
Kenya CPI    0
dtype: int64
plt.plot(data)
plt.title('Historical Price Index')
plt.xlabel('Year')
plt.ylabel('Kenya Consumer Price Index')
plt.show()
# Normalize the data using Min-Max scaling
# Min-Max scaling is a technique for normalizing the data,
# ........which means transforming it so that the values fall within a specific range
# Scaling ensures the data has a similar scale across features and falls within a range that is
# .......... suitable for the neural network to learn effectively.
scaler = MinMaxScaler()
data = scaler.fit_transform(data) # data defined above
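As a quick illustration (a hypothetical toy array, not part of the CPI dataset), Min-Max scaling maps each value x to (x - min) / (max - min), and inverse_transform undoes the mapping:
# Illustrative sketch only: Min-Max scaling on a toy array
toy = np.array([[100.0], [150.0], [200.0]])
toy_scaler = MinMaxScaler()
toy_scaled = toy_scaler.fit_transform(toy)                 # [[0.0], [0.5], [1.0]]
toy_restored = toy_scaler.inverse_transform(toy_scaled)    # back to [[100.], [150.], [200.]]
print(toy_scaled.ravel(), toy_restored.ravel())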
# Standard basic RNN data-preparation procedures
# e.g. create sequences for training
# Sequences are essential for training RNNs because they can capture temporal dependencies in the data
sequence_length = 10 # You can adjust this value
X, y = [], []
# The loop range ensures we don't go out of bounds when creating sequences.
for i in range(len(data) - sequence_length):
    X.append(data[i:i+sequence_length])
    y.append(data[i+sequence_length])
X = np.array(X)
y = np.array(y)
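For intuition, here is a hypothetical mini example (not from the CPI data): with a window of 3 over the series [1, 2, 3, 4, 5], the inputs are [1, 2, 3] and [2, 3, 4] and the targets are 4 and 5.
# Illustrative sketch: windowing a tiny series with a window of 3
toy_series = np.arange(1, 6).reshape(-1, 1)   # [[1], [2], [3], [4], [5]]
toy_X, toy_y = [], []
for i in range(len(toy_series) - 3):
    toy_X.append(toy_series[i:i+3])
    toy_y.append(toy_series[i+3])
print(np.array(toy_X).shape, np.array(toy_y).shape)   # (2, 3, 1) (2, 1)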
# Split the data into training and testing sets
# Note that we use 80% for training and 20% for testing; shuffle=False preserves the chronological order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
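Because shuffle=False simply assigns the earliest windows to training and the latest to testing, the same split can be written by hand (an equivalent sketch, up to rounding of the split point):
# Sketch: equivalent manual chronological split
split_idx = int(len(X) * 0.8)
X_train_manual, X_test_manual = X[:split_idx], X[split_idx:]
y_train_manual, y_test_manual = y[:split_idx], y[split_idx:]
print(X_train_manual.shape, X_test_manual.shape)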
# Build the Long Short-Term Memory (LSTM) model
# example of a simple RNN model using the Long Short-Term Memory (LSTM) architecture
model = keras.Sequential([
    keras.layers.LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
    keras.layers.Dense(1)
])
# We can also try SGD or RMSprop in place of adam, or vice versa.
# These optimizers adapt the learning rate during training to improve convergence.
model.compile(optimizer='adam', loss='mean_squared_error')
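If you want to try the alternatives mentioned above, the optimizer can be swapped at compile time; the learning rates below are assumed values for illustration, not settings from this notebook:
# Sketch: compiling the same model with alternative optimizers (learning rates are assumptions)
# model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.01), loss='mean_squared_error')
# model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=0.001), loss='mean_squared_error')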
# Train the model
# epoch def: This parameter specifies the number of times the model will iterate through your entire dataset.
#therefore an epoch represents one complete pass through the entire training dataset
model.fit(X_train, y_train, epochs=50, batch_size=30)
# If adam was used, loss started at 0.0042 - 0.3880 - 0.450
# If SGD was used instead of adam, loss started at 0.3907 - 0.4032
# If RMSprop was used, loss started at 0.4250
Epoch 1/50  - loss: 0.5639
Epoch 10/50 - loss: 0.3624
Epoch 20/50 - loss: 0.2082
Epoch 30/50 - loss: 0.0852
Epoch 40/50 - loss: 0.0040
Epoch 50/50 - loss: 0.0094
(full per-epoch log truncated; the loss fell steadily from 0.5639 to about 0.0040 around epoch 40, then fluctuated slightly, ending at 0.0094)
<keras.src.callbacks.History at 0x192024b6310>
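Since the loss starts to fluctuate near the end of training, one optional refinement (not used in the original run, shown only as a sketch) is to hold out a validation split and stop early once it stops improving:
# Optional sketch: early stopping on a held-out validation split
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# model.fit(X_train, y_train, epochs=50, batch_size=30, validation_split=0.1, callbacks=[early_stop])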
print("Actual values in the array:")
print(X_test)
#print("Actual value for the first sample:", X_test[0])
Actual values in the array:
[[[0.71279762]
  [0.58333333]
  [0.45535714]
  [0.49925595]
  [0.64508929]
  [0.5297619 ]
  [0.50446429]
  [0.59821429]
  [0.64806548]
  [0.7671131 ]]]
# Make predictions on the test data
predictions = model.predict(X_test)
# We now need to inverse transform the predictions back to the original scale,
# ..... because we applied the scaler as a standard preprocessing step for RNNs
# ..... and we want to interpret the predictions in their original context.
predictions = scaler.inverse_transform(predictions) # Inverse scaling
predictions
pred_y = model.predict(X_test)
pred_y # model.predict returns the same prediction both times (still on the scaled [0, 1] range here)
1/1 [==============================] - 0s 264ms/step 1/1 [==============================] - 0s 32ms/step
array([[0.752218]], dtype=float32)
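To see what inverse_transform does here, a sketch using the scaler's fitted attributes (MinMaxScaler exposes the fitted minimum and maximum as data_min_ and data_max_) maps the scaled prediction back to the CPI scale by hand:
# Sketch: manually undoing Min-Max scaling for the test prediction
scaled_pred = pred_y[0, 0]
manual_pred = scaled_pred * (scaler.data_max_[0] - scaler.data_min_[0]) + scaler.data_min_[0]
print(manual_pred)   # should match scaler.inverse_transform(pred_y)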
#I took an approximate average of the actual target values for the test sample
#actual_value = 1.0
actual_value = 0.6
# The model's prediction as above
prediction = 0.71092254
# Calculate metrics
mse = mean_squared_error([actual_value], [prediction])
rmse = np.sqrt(mse)
mae = mean_absolute_error([actual_value], [prediction])
r2 = r2_score([actual_value], [prediction])
print(f'Mean Squared Error (MSE): {mse}') #Lower values indicate better performance.
print(f'Root Mean Squared Error (RMSE): {rmse}') #average magnitude of the error, the lower the better
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R²): {r2}')
Mean Squared Error (MSE): 0.012303809880051603
Root Mean Squared Error (RMSE): 0.11092254000000001
Mean Absolute Error (MAE): 0.11092254000000001
R-squared (R²): nan
Our values for MSE, RMSE, and MAE suggest that the prediction is close to the assumed true target value. The R² value is "nan" because it was computed from a single observation: with only one true value there is no variation in the target, so R² is undefined.
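A fuller check would score the model against every test window rather than a single hand-picked value; a sketch of that (using the X_test and y_test already defined, both brought back to the original CPI scale) could look like this, keeping in mind that this particular split contains very few test windows:
# Sketch: metrics over the whole test set, on the original CPI scale
y_pred = scaler.inverse_transform(model.predict(X_test))
y_true = scaler.inverse_transform(y_test)
print('MSE :', mean_squared_error(y_true, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_true, y_pred)))
print('MAE :', mean_absolute_error(y_true, y_pred))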
# Evaluate the model
loss = model.evaluate(X_test, y_test)
#loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
1/1 [==============================] - 0s 322ms/step - loss: 0.0614
Test Loss: 0.06139591708779335
In summary
#import matplotlib.pyplot as plt
# Visualize predictions
#plt.figure(figsize=(12, 6))
#plt.plot(df.index[-len(y_test):], y_test, label='Original Values')
#plt.plot(df.index[-len(y_test):], predictions, label='Predictions', linestyle='dashed')
#plt.xlabel('YEAR')
#plt.ylabel('Kenya CPI')
#plt.legend()
#plt.title('Price Index Prediction with LSTM')
#plt.show()
# number of future time periods to forecast defined here
future_periods = 30 # we can always adjust this as needed
# Create a date range for the future time periods
# (note: newer pandas versions use inclusive='right' instead of closed='right')
future_dates = pd.date_range(start=df.index[-1], periods=future_periods + 1, closed='right')
# Initialize an array for forecasted values
forecasted_values = []
# Make predictions for future time periods
for i in range(future_periods):
    # Select the most recent sequence of data (data holds the scaled Kenya CPI series defined above)
    recent_sequence = data[-sequence_length:]
    # Reshape the sequence to match the model's input shape
    recent_sequence = recent_sequence.reshape(1, sequence_length, 1)
    # Predict the next value
    next_value = model.predict(recent_sequence)
    # Append the predicted value to the dataset so the next iteration can use it
    data = np.append(data, next_value, axis=0)
    # Append the predicted value (as a scalar) to the list of forecasted values
    forecasted_values.append(next_value[0, 0])
# Inverse transform the forecasted values to the original scale
forecasted_values = scaler.inverse_transform(np.array(forecasted_values).reshape(-1, 1)).flatten()
# Create a DataFrame for the forecasted values
forecasted_df = pd.DataFrame({'Date': future_dates[0:], 'Forecasted_CPI': forecasted_values})
# Concatenate the original data (with Date restored as a column) and the forecasted values
complete_df = pd.concat([df.reset_index(), forecasted_df], axis=0)
# Visualize the original data and the forecasted values
plt.figure(figsize=(12, 6))
plt.plot(complete_df['Date'], complete_df['Kenya CPI'], label='Original Data')
plt.plot(forecasted_df['Date'], forecasted_df['Forecasted_CPI'], label='Forecasted Values', linestyle='dashed')
plt.title('Forecasting Future Kenya CPI with LSTM')
plt.xlabel('Year')
plt.ylabel('Kenya Consumer Price Index')
plt.legend()
plt.show()