Files

8.6 KiB
Raw Permalink Blame History

Practical-4 (Recurrent Neural Network - Google Stock Price Dataset)

Problem Statement: Recurrent neural network (RNN): Use the Google stock prices dataset and design a time series analysis and prediction system using RNN.

Note

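Dataset available in Datasets directory. In the code, the dataset is downloaded directly from Yahoo Finance (via the yfinance package) in the 2nd step (Load Dataset); an offline CSV alternative is provided there as commented-out code.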


Prerequisites

  1. Install packages using pip: pip install tensorflow keras numpy pandas matplotlib scikit-learn yfinance (tensorflow requires Python 3.9 - 3.12)

Steps

  1. Import Libraries
  2. Load Dataset
  3. Exploratory Data Analysis (EDA)
  4. Visualize Closing Price Over Time
  5. Preprocess Data - Normalize Closing Price
  6. Create Sequences for RNN Input
  7. Build the RNN Model
  8. Train the Model
  9. Plot Training vs Validation Loss
  10. Make Predictions and Inverse Scale
  11. Evaluate the Model
  12. Plot Actual vs Predicted Stock Price
  13. Forecast Next 30 Days

Code

1. Import Libraries:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, SimpleRNN, Dropout
from tensorflow.keras.callbacks import EarlyStopping

2. Load Dataset:

# Downloads GOOGL stock data from Yahoo Finance for the given date range
ticker = "GOOGL"
df = yf.download(ticker, start="2018-01-01", end="2024-01-01")

# --- Offline alternative (comment out the yf.download above and use this instead if using local dataset) ---
# df = pd.read_csv('GOOGL.csv', index_col='Date', parse_dates=True)
# df = df.sort_index()  # ensure chronological order

# A download can come back empty (e.g. no network or an unknown ticker).
# Stop here with a clear message instead of failing later on the date
# formatting or on the missing 'Close' column.
if df.empty:
    raise RuntimeError(
        f"No data loaded for {ticker}; check the network connection "
        "or use the offline CSV alternative."
    )

# yfinance returns MultiIndex columns — flatten to single level.
# A frame loaded from the offline CSV already has flat columns and is left as-is.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

print(f"Dataset Shape: {df.shape}")
print(f"Date Range: {df.index.min().date()} to {df.index.max().date()}")
print(df.head())

3. Exploratory Data Analysis (EDA):

# Quick overview of the loaded frame: structure, summary statistics, gaps.
print("=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line.
df.info()
print("\n=== Statistical Summary ===")
print(df.describe())
print("\n=== Missing Values ===")
print(df.isnull().sum())  # per-column count of missing entries

4. Visualize Closing Price Over Time:

# Line chart of the daily closing price across the whole downloaded range.
plt.figure(figsize=(16, 6))
plt.plot(df.index, df['Close'], color='steelblue', linewidth=1.5, label='Close Price')
# The year range had lost its dash ("20182024"); restored as 2018–2024.
plt.title('Google (GOOGL) Stock Closing Price (2018–2024)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

5. Preprocess Data - Normalize Closing Price:

data = df[['Close']].values   # use only Close price for prediction; shape (n_days, 1)

# Fit the scaler on the training portion only. Fitting on the full series
# would leak the min/max of the held-out test period into training.
# This fraction must match the train/test split used when the sequences are
# created (80/20).
TRAIN_FRACTION = 0.80
n_fit = int(len(data) * TRAIN_FRACTION)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data[:n_fit])              # learn min/max from training rows only
data_scaled = scaler.transform(data)  # apply the same mapping to every row

# Training rows land in [0, 1]; later rows may fall slightly outside that
# interval if their prices exceed the range seen during training.
print(f"Original data range: [{data.min():.2f}, {data.max():.2f}]")
print(f"Scaled data range:   [{data_scaled.min():.4f}, {data_scaled.max():.4f}]")
print(f"Total data points:   {len(data_scaled)}")

6. Create Sequences for RNN Input:

def create_sequences(data, time_steps=60):
    """Build (window, next value) training pairs from column 0 of *data*.

    Args:
        data: 2-D array-like of shape (n_samples, n_features). Only column 0
            is read.
        time_steps: Number of consecutive past values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_samples - time_steps, time_steps)`` and ``y`` has shape
        ``(n_samples - time_steps,)``; ``y[k]`` is the value that directly
        follows window ``X[k]``. If the series has no more than
        ``time_steps`` rows, both arrays are empty and ``X`` still has shape
        ``(0, time_steps)`` so that later reshapes keep working.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    from numpy.lib.stride_tricks import sliding_window_view

    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    series = np.asarray(data)[:, 0]

    # Too short to form even one (window, target) pair.
    if len(series) <= time_steps:
        empty_X = np.empty((0, time_steps), dtype=series.dtype)
        empty_y = np.empty(0, dtype=series.dtype)
        return empty_X, empty_y

    # All length-`time_steps` windows; the last one has no following value,
    # so it is dropped. Copy because the view is read-only and shares memory.
    X = sliding_window_view(series, time_steps)[:-1].copy()
    y = series[time_steps:].copy()  # value right after each window
    return X, y

# Window length: each sample holds the previous 60 days and targets the day after.
TIME_STEPS = 60

# Chronological 80/20 split — no shuffling, so the test period lies strictly
# after the training period.
train_size = int(0.80 * len(data_scaled))

# The test slice starts TIME_STEPS rows early so that the first test target
# (row `train_size`) has a full window of history in front of it.
train_data, test_data = data_scaled[:train_size], data_scaled[train_size - TIME_STEPS:]

X_train, y_train = create_sequences(train_data, TIME_STEPS)
X_test, y_test = create_sequences(test_data, TIME_STEPS)

# RNN layers expect [samples, time_steps, features]; add a trailing feature axis of size 1.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(f"Training samples: {X_train.shape}")
print(f"Testing samples:  {X_test.shape}")

7. Build the RNN Model:

# Two stacked SimpleRNN layers followed by a small dense head that outputs
# one (scaled) price per input window.
model = Sequential([
    Input(shape=(TIME_STEPS, 1)),                    # one window: TIME_STEPS days, 1 feature
    SimpleRNN(units=64, return_sequences=True),      # emits the full sequence for the next RNN layer
    Dropout(0.2),                                    # dropout rate 0.2 to reduce overfitting
    SimpleRNN(units=64, return_sequences=False),     # emits only the final state vector
    Dropout(0.2),
    Dense(units=32, activation='relu'),              # fully connected layer
    Dense(units=1),                                  # single predicted value
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)
model.summary()

8. Train the Model:

# Stop once val_loss has failed to improve for 10 epochs in a row, and roll
# the model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

fit_options = dict(
    epochs=60,
    batch_size=32,
    validation_split=0.1,   # hold out 10% of the training data for validation
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# One loss entry is recorded per completed epoch.
epochs_run = len(history.history['loss'])
print(f"\nTraining stopped at epoch: {epochs_run}")

9. Plot Training vs Validation Loss:

# Training and validation loss curves, one point per epoch.
loss_curves = (
    ('loss', 'Train Loss', 'royalblue'),
    ('val_loss', 'Val Loss', 'tomato'),
)
for history_key, curve_label, curve_color in loss_curves:
    plt.plot(history.history[history_key], label=curve_label, color=curve_color)

plt.title('Model Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

10. Make Predictions and Inverse Scale:

# Predictions come out in the scaler's normalised units.
y_pred_scaled = model.predict(X_test)

# Map both predictions and targets back to USD. The scaler works on 2-D
# input, so the 1-D target vector is turned into a single column first.
y_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred = scaler.inverse_transform(y_pred_scaled)

first_pred = y_pred[:5].flatten().round(2)
first_actual = y_actual[:5].flatten().round(2)
print(f"Sample predictions (first 5): {first_pred}")
print(f"Actual values      (first 5): {first_actual}")

11. Evaluate the Model:

# Error metrics on the test set, in original USD units.
mse = mean_squared_error(y_actual, y_pred)
mae = mean_absolute_error(y_actual, y_pred)
rmse = np.sqrt(mse)

# Mean absolute percentage error: average of |error| relative to the actual price.
relative_error = np.abs((y_actual - y_pred) / y_actual)
mape = np.mean(relative_error) * 100

banner = "=" * 40
print(banner)
print("     MODEL EVALUATION METRICS")
print(banner)
for metric_name, metric_value in (("MSE ", mse), ("RMSE", rmse), ("MAE ", mae)):
    print(f"  {metric_name} : {metric_value:.4f}")
print(f"  MAPE : {mape:.2f}%")
print(banner)

12. Plot Actual vs Predicted Stock Price:

# Dates from row `train_size` onward line up one-to-one with the test targets.
test_dates = df.index[train_size:]

plt.figure(figsize=(16, 6))
price_series = (
    (y_actual, 'Actual Price', 'steelblue', {}),
    (y_pred, 'Predicted Price', 'tomato', {'linestyle': '--'}),
)
for prices, series_label, series_color, extra_style in price_series:
    plt.plot(test_dates, prices, label=series_label, color=series_color,
             linewidth=1.5, **extra_style)

plt.title('Google Stock Price: Actual vs Predicted (RNN)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

13. Forecast Next 30 Days:

n_future = 30  # how many future business days to forecast

# Start from the most recent TIME_STEPS scaled values, shaped as one sample.
future_input = data_scaled[-TIME_STEPS:].reshape(1, TIME_STEPS, 1)
future_predictions = []

# Recursive forecasting: every prediction is fed back in as the newest
# element of the window while the oldest element is discarded.
for _step in range(n_future):
    pred = model.predict(future_input, verbose=0)
    future_predictions.append(pred[0, 0])
    shifted_window = future_input[:, 1:, :]
    newest_value = pred.reshape(1, 1, 1)
    future_input = np.concatenate((shifted_window, newest_value), axis=1)

# Back to USD; the scaler expects a single-column 2-D array.
scaled_forecast = np.array(future_predictions).reshape(-1, 1)
future_prices = scaler.inverse_transform(scaled_forecast)

# Business-day dates beginning the day after the last known date.
last_date = df.index[-1]
first_future_day = last_date + pd.Timedelta(days=1)
future_dates = pd.date_range(start=first_future_day, periods=n_future, freq='B')

# Show the last 120 known days for context next to the forecast.
recent_dates = df.index[-120:]
recent_prices = scaler.inverse_transform(data_scaled[-120:])

plt.figure(figsize=(16, 6))
plt.plot(recent_dates, recent_prices,
         label='Historical', color='steelblue', linewidth=1.5)
plt.plot(future_dates, future_prices,
         label='30-Day Forecast', color='orange', linewidth=1.5)
plt.axvline(x=last_date, color='gray', linestyle='--', label='Forecast Start')
plt.title('Google Stock — 30-Day Future Price Forecast (RNN)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nForecasted price range: {future_prices.min():.2f} USD - {future_prices.max():.2f} USD")

Miscellaneous