Files

277 lines
8.6 KiB
Markdown
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Practical-4 (Recurrent Neural Network - Google Stock Price Dataset)
Problem Statement: Recurrent neural network (RNN): Use the Google stock prices dataset and design a time series analysis and prediction system using RNN.
> [!NOTE]
> Dataset available in [Datasets](../Datasets/GOOG.csv) directory.
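> In the code, the dataset is downloaded directly from Yahoo Finance using `yfinance` in the 2nd step (Load Dataset); a commented-out offline alternative for a local CSV is included there as well.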
---
## Prerequisites
1. Install packages using `pip`: `pip install tensorflow keras numpy pandas matplotlib scikit-learn yfinance` (`tensorflow` requires Python 3.9 - 3.12)
## Steps
1. Import Libraries
2. Load Dataset
3. Exploratory Data Analysis (EDA)
4. Visualize Closing Price Over Time
5. Preprocess Data - Normalize Closing Price
6. Create Sequences for RNN Input
7. Build the RNN Model
8. Train the Model
9. Plot Training vs Validation Loss
10. Make Predictions and Inverse Scale
11. Evaluate the Model
12. Plot Actual vs Predicted Stock Price
13. Forecast Next 30 Days
---
## Code
### 1. Import Libraries:
```python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, SimpleRNN, Dropout
from tensorflow.keras.callbacks import EarlyStopping
```
### 2. Load Dataset:
```python3
# Download GOOGL daily price data from Yahoo Finance for the given date range
ticker = "GOOGL"
df = yf.download(ticker, start="2018-01-01", end="2024-01-01")
# --- Offline alternative (comment out the yf.download above and use this instead if using local dataset) ---
# df = pd.read_csv('GOOGL.csv', index_col='Date', parse_dates=True)
# df = df.sort_index() # ensure chronological order

# A failed download (network problem, bad ticker) can come back as an empty
# DataFrame instead of an exception; stop here with a clear message rather
# than failing later inside the scaler or the model.
if df.empty:
    raise RuntimeError(
        f"No price data loaded for {ticker}: check the network connection "
        "or switch to the offline CSV alternative"
    )

# yfinance may return MultiIndex columns — flatten to a single level.
# A locally loaded CSV already has flat columns and is left untouched.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

print(f"Dataset Shape: {df.shape}")
print(f"Date Range: {df.index.min().date()} to {df.index.max().date()}")
print(df.head())
```
### 3. Exploratory Data Analysis (EDA):
```python3
# Quick structural and statistical overview of the loaded price table
print("=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print("\n=== Statistical Summary ===")
print(df.describe())
print("\n=== Missing Values ===")
# Per-column count of missing entries
print(df.isnull().sum())
```
### 4. Visualize Closing Price Over Time:
```python3
# Line chart of the daily closing price across the whole downloaded period
plt.figure(figsize=(16, 6))
plt.plot(df.index, df['Close'], color='steelblue', linewidth=1.5, label='Close Price')
# The year range needs a separator; without it the title reads "20182024"
plt.title('Google (GOOGL) Stock Closing Price (2018–2024)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
```
### 5. Preprocess Data - Normalize Closing Price:
```python3
data = df[['Close']].values  # use only Close price for prediction, shape (n_days, 1)

# Fit the scaler on the training portion only (the first 80%, the same
# boundary used for the train/test split in step 6). Fitting on the full
# series would leak the test period's min/max into training.
n_fit = int(len(data) * 0.80)
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data[:n_fit])

# Transform the whole series with the train-fitted scaler. Values from the
# test period may fall slightly outside [0, 1]; that is expected.
data_scaled = scaler.transform(data)

print(f"Original data range: [{data.min():.2f}, {data.max():.2f}]")
print(f"Scaled data range: [{data_scaled.min():.4f}, {data_scaled.max():.4f}]")
print(f"Total data points: {len(data_scaled)}")
```
### 6. Create Sequences for RNN Input:
```python3
def create_sequences(data, time_steps=60):
    """Build sliding-window samples from a column-vector time series.

    Args:
        data: 2-D array-like of shape (n, k); only column 0 is used.
        time_steps: Number of consecutive past values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n - time_steps, time_steps)`` and holds the windows; ``y`` has
        shape ``(n - time_steps,)`` and holds the value that immediately
        follows each window. When ``data`` has no more than ``time_steps``
        rows, ``X`` is empty with shape ``(0, time_steps)`` so callers can
        still read ``X.shape[1]``.
    """
    data = np.asarray(data)
    target_rows = range(time_steps, len(data))
    # Each window covers the `time_steps` rows that precede its target row
    X = [data[i - time_steps:i, 0] for i in target_rows]
    y = [data[i, 0] for i in target_rows]
    if not X:
        # np.array([]) would be 1-D with shape (0,); keep the window axis so
        # a later reshape to (samples, time_steps, 1) does not fail.
        return (
            np.empty((0, time_steps), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
        )
    return np.array(X), np.array(y)
TIME_STEPS = 60  # look-back window: the past 60 days are used to predict the next day


def _add_feature_axis(windows):
    """Reshape (samples, time_steps) windows to (samples, time_steps, 1)."""
    return windows.reshape((windows.shape[0], windows.shape[1], 1))


# Chronological 80/20 split (no shuffling, so time order is preserved)
train_size = int(len(data_scaled) * 0.80)
train_data = data_scaled[:train_size]
# Start TIME_STEPS rows before the boundary so that the first test window
# predicts row `train_size`, i.e. the first row after the training data.
test_data = data_scaled[train_size - TIME_STEPS:]

X_train, y_train = create_sequences(train_data, TIME_STEPS)
X_test, y_test = create_sequences(test_data, TIME_STEPS)

# RNN layers expect input shaped [samples, time_steps, features]
X_train = _add_feature_axis(X_train)
X_test = _add_feature_axis(X_test)

print(f"Training samples: {X_train.shape}")
print(f"Testing samples: {X_test.shape}")
```
### 7. Build the RNN Model:
```python3
# Two stacked SimpleRNN layers with dropout, followed by a small dense head
# that regresses a single value (the next scaled closing price).
model = Sequential([
    Input(shape=(TIME_STEPS, 1)),                # one feature per day over the look-back window
    SimpleRNN(units=64, return_sequences=True),  # emits the full sequence for the next RNN layer
    Dropout(0.2),                                # randomly drops 20% of activations during training
    SimpleRNN(units=64, return_sequences=False), # emits only the final state vector
    Dropout(0.2),
    Dense(units=32, activation='relu'),          # fully connected layer
    Dense(units=1),                              # single predicted price
])
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()
```
### 8. Train the Model:
```python3
# Stop once val_loss has not improved for 10 epochs in a row, and roll the
# model back to the weights from its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

fit_options = {
    'epochs': 60,
    'batch_size': 32,
    'validation_split': 0.1,  # hold out 10% of the training data for validation
    'callbacks': [early_stop],
    'verbose': 1,
}
history = model.fit(X_train, y_train, **fit_options)

# One loss entry is recorded per completed epoch
print(f"\nTraining stopped at epoch: {len(history.history['loss'])}")
```
### 9. Plot Training vs Validation Loss:
```python3
# Overlay the per-epoch training and validation loss curves
loss_curves = (
    ('loss', 'Train Loss', 'royalblue'),
    ('val_loss', 'Val Loss', 'tomato'),
)
for history_key, curve_label, curve_color in loss_curves:
    plt.plot(history.history[history_key], label=curve_label, color=curve_color)
plt.title('Model Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
```
### 10. Make Predictions and Inverse Scale:
```python3
# Model output is in the scaler's normalized space
y_pred_scaled = model.predict(X_test)

# Map predictions and ground truth back to USD; the scaler works on column
# vectors, so the 1-D targets are reshaped to (n, 1) first.
y_test_column = y_test.reshape(-1, 1)
y_pred = scaler.inverse_transform(y_pred_scaled)
y_actual = scaler.inverse_transform(y_test_column)

print(f"Sample predictions (first 5): {y_pred[:5].flatten().round(2)}")
print(f"Actual values (first 5): {y_actual[:5].flatten().round(2)}")
```
### 11. Evaluate the Model:
```python3
# Error metrics in original price units (USD)
mse = mean_squared_error(y_actual, y_pred)
mae = mean_absolute_error(y_actual, y_pred)
rmse = np.sqrt(mse)

# Mean absolute percentage error: absolute error relative to the true price
relative_error = (y_actual - y_pred) / y_actual
mape = np.mean(np.abs(relative_error)) * 100

banner = "=" * 40
print(banner)
print(" MODEL EVALUATION METRICS")
print(banner)
print(f" MSE : {mse:.4f}")
print(f" RMSE : {rmse:.4f}")
print(f" MAE : {mae:.4f}")
print(f" MAPE : {mape:.2f}%")
print(banner)
```
### 12. Plot Actual vs Predicted Stock Price:
```python3
# Dates from the split boundary onward line up one-to-one with the test targets
test_dates = df.index[train_size:]

actual_style = {'label': 'Actual Price', 'color': 'steelblue', 'linewidth': 1.5}
predicted_style = {'label': 'Predicted Price', 'color': 'tomato', 'linewidth': 1.5, 'linestyle': '--'}

plt.figure(figsize=(16, 6))
plt.plot(test_dates, y_actual, **actual_style)
plt.plot(test_dates, y_pred, **predicted_style)
plt.title('Google Stock Price: Actual vs Predicted (RNN)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
```
### 13. Forecast Next 30 Days:
```python3
n_future = 30  # how many future business days to forecast

# Start from the most recent TIME_STEPS scaled values, shaped as one sample
future_input = data_scaled[-TIME_STEPS:].reshape(1, TIME_STEPS, 1)
future_predictions = []

# Recursive forecasting: every prediction is fed back in as the newest step
for _ in range(n_future):
    next_scaled = model.predict(future_input, verbose=0)
    future_predictions.append(next_scaled[0, 0])
    # Drop the oldest step and append the new prediction at the end
    remaining_steps = future_input[:, 1:, :]
    future_input = np.append(remaining_steps, next_scaled.reshape(1, 1, 1), axis=1)

# Convert the scaled forecasts back to USD
scaled_forecast_column = np.array(future_predictions).reshape(-1, 1)
future_prices = scaler.inverse_transform(scaled_forecast_column)

# Business-day dates beginning the day after the last known date
last_date = df.index[-1]
forecast_start = last_date + pd.Timedelta(days=1)
future_dates = pd.date_range(start=forecast_start, periods=n_future, freq='B')

# Show the last 120 known days followed by the forecast
history_dates = df.index[-120:]
history_prices = scaler.inverse_transform(data_scaled[-120:])

plt.figure(figsize=(16, 6))
plt.plot(history_dates, history_prices,
         label='Historical', color='steelblue', linewidth=1.5)
plt.plot(future_dates, future_prices,
         label='30-Day Forecast', color='orange', linewidth=1.5)
plt.axvline(x=last_date, color='gray', linestyle='--', label='Forecast Start')
plt.title('Google Stock — 30-Day Future Price Forecast (RNN)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nForecasted price range: {future_prices.min():.2f} USD - {future_prices.max():.2f} USD")
```
---
## Miscellaneous
- [Dataset source](https://www.kaggle.com/datasets/henryshan/google-stock-price)
---