{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "747f31ca",
   "metadata": {},
   "source": [
    "# Capstone Session 9\n",
    "\n",
    "This notebook was generated from the directions in `Capstone_Session_9.pdf` and uses the `Churn_Modeling.csv` dataset, which is loaded from the repository's raw GitHub URL."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e91ba6ae",
   "metadata": {},
   "source": [
    "## Objective\n",
    "\n",
    "Build the required artificial neural network for customer churn prediction, evaluate it on the held-out test set, and score the specified sample customer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a10b94a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:20.470374Z",
     "iopub.status.busy": "2026-05-12T09:42:20.470374Z",
     "iopub.status.idle": "2026-05-12T09:42:24.663364Z",
     "shell.execute_reply": "2026-05-12T09:42:24.662328Z"
    }
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "from urllib.parse import quote\n",
    "\n",
    "# TensorFlow's native logger picks up TF_CPP_MIN_LOG_LEVEL while the library loads,\n",
    "# so it has to be set before `import tensorflow` for import-time messages to be\n",
    "# filtered. '2' hides INFO and WARNING output.\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import tensorflow as tf\n",
    "from IPython.display import display\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
    "\n",
    "# Seed Python, NumPy and TensorFlow in one call so a fresh Run All is repeatable.\n",
    "tf.keras.utils.set_random_seed(42)\n",
    "\n",
    "# True when the `google.colab` module is already loaded in this process.\n",
    "IS_COLAB = 'google.colab' in sys.modules\n",
    "\n",
    "# GitHub coordinates used to build the raw download URL of the dataset.\n",
    "GITHUB_REPO_OWNER = 'FrancisBurnet'\n",
    "GITHUB_REPO_NAME = 'francisburnet'\n",
    "GITHUB_REPO_BRANCH = 'main'\n",
    "\n",
    "# Repository-relative folder of this capstone and the dataset file name.\n",
    "CAPSTONE_ROOT = Path('Incremental Capstones/Deep Learning Specialization/Capstone Session 9')\n",
    "DATASET_FILENAME = 'Churn_Modeling.csv'\n",
    "\n",
    "\n",
    "def build_raw_github_url(relative_path: Path) -> str:\n",
    "    \"\"\"Return the raw.githubusercontent.com URL of a repository-relative path.\n",
    "\n",
    "    The path is rendered with forward slashes and percent-encoded (slashes are\n",
    "    kept), so folder names that contain spaces still yield a valid URL.\n",
    "    \"\"\"\n",
    "    url_path = quote(relative_path.as_posix(), safe='/')\n",
    "    repo_prefix = f\"https://raw.githubusercontent.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}/\"\n",
    "    return repo_prefix + f\"{GITHUB_REPO_BRANCH}/{url_path}\"\n",
    "\n",
    "\n",
    "def resolve_capstone_dir() -> Path | None:\n",
    "    \"\"\"Locate the capstone folder by walking up from the current working directory.\n",
    "\n",
    "    Each ancestor, starting with the working directory itself, is accepted when it\n",
    "    either has the capstone folder's name and holds the dataset file, or contains\n",
    "    the capstone folder at the repository-relative ``CAPSTONE_ROOT`` path.\n",
    "\n",
    "    Returns:\n",
    "        The first matching directory, or ``None`` when no ancestor qualifies.\n",
    "    \"\"\"\n",
    "    start = Path.cwd().resolve()\n",
    "    for ancestor in (start, *start.parents):\n",
    "        has_capstone_name = ancestor.name == CAPSTONE_ROOT.name\n",
    "        if has_capstone_name and (ancestor / DATASET_FILENAME).exists():\n",
    "            return ancestor\n",
    "        repo_relative = ancestor / CAPSTONE_ROOT\n",
    "        if repo_relative.exists():\n",
    "            return repo_relative\n",
    "    return None\n",
    "\n",
    "\n",
    "CAPSTONE_DIR = resolve_capstone_dir()\n",
    "DATASET_URL = build_raw_github_url(CAPSTONE_ROOT / DATASET_FILENAME)\n",
    "\n",
    "# Write into the capstone folder when it was found; otherwise use a scratch folder.\n",
    "if CAPSTONE_DIR is None:\n",
    "    if IS_COLAB:\n",
    "        runtime_root = Path('/content/capstone-session-9-runtime')\n",
    "    else:\n",
    "        runtime_root = Path.cwd().resolve() / 'capstone-session-9-runtime'\n",
    "    OUTPUT_ROOT = runtime_root\n",
    "    OUTPUT_MODE = 'runtime scratch outputs; export final artifacts back into the capstone outputs folder'\n",
    "else:\n",
    "    OUTPUT_ROOT = CAPSTONE_DIR\n",
    "    OUTPUT_MODE = 'permanent capstone outputs'\n",
    "\n",
    "# Create the outputs folder and its plots subfolder (no error if they already exist).\n",
    "OUTPUTS_DIR = (OUTPUT_ROOT / 'outputs').resolve()\n",
    "PLOTS_DIR = OUTPUTS_DIR / 'plots'\n",
    "OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "PLOTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Display defaults shared by the cells below.\n",
    "sns.set_theme(style='whitegrid')\n",
    "pd.set_option('display.max_columns', 100)\n",
    "\n",
    "# Report the resolved environment so the run is traceable.\n",
    "print('Runtime:', 'Local / notebook runtime' if not IS_COLAB else 'Google Colab')\n",
    "print('Capstone directory:', 'Not available in current runtime' if CAPSTONE_DIR is None else CAPSTONE_DIR)\n",
    "print('Dataset source:', DATASET_URL)\n",
    "print('Output mode:', OUTPUT_MODE)\n",
    "print('Outputs directory:', OUTPUTS_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3de91e8d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:24.665747Z",
     "iopub.status.busy": "2026-05-12T09:42:24.665747Z",
     "iopub.status.idle": "2026-05-12T09:42:24.700317Z",
     "shell.execute_reply": "2026-05-12T09:42:24.700317Z"
    }
   },
   "outputs": [],
   "source": [
    "from io import StringIO\n",
    "\n",
    "# Read the churn dataset from the raw GitHub URL built in the setup cell.\n",
    "df = pd.read_csv(DATASET_URL)\n",
    "\n",
    "# Missing values per column, as a count and as a percentage of rows.\n",
    "null_mask = df.isna()\n",
    "missing_summary = pd.DataFrame({\n",
    "    'missing_count': null_mask.sum(),\n",
    "    'missing_pct': null_mask.mean().mul(100).round(2),\n",
    "})\n",
    "\n",
    "# Capture `DataFrame.info` in a buffer so it can be printed with the other text output.\n",
    "info_buffer = StringIO()\n",
    "df.info(buf=info_buffer)\n",
    "\n",
    "print('Dataset source used:', DATASET_URL)\n",
    "print('Shape:', df.shape)\n",
    "print('Duplicate rows:', int(df.duplicated().sum()))\n",
    "print(info_buffer.getvalue())\n",
    "display(df.head(), df.describe().T, missing_summary)\n",
    "print('Target distribution:', df['Exited'].value_counts().to_dict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c092e5c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:24.702357Z",
     "iopub.status.busy": "2026-05-12T09:42:24.702357Z",
     "iopub.status.idle": "2026-05-12T09:42:24.734556Z",
     "shell.execute_reply": "2026-05-12T09:42:24.733551Z"
    }
   },
   "outputs": [],
   "source": [
    "# Drop the row number, customer id and surname columns before modelling.\n",
    "working_df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname']).copy()\n",
    "\n",
    "# Target is `Exited`; every remaining column is a feature.\n",
    "categorical_columns = ['Geography', 'Gender']\n",
    "y = working_df['Exited']\n",
    "X = working_df.drop(columns=['Exited'])\n",
    "numeric_columns = [name for name in X.columns if name not in categorical_columns]\n",
    "\n",
    "# Standardise the numeric features and one-hot encode the categorical ones.\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', StandardScaler(), numeric_columns),\n",
    "        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Stratified 80/20 split; the preprocessor is fitted on the training rows only.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=0,\n",
    "    stratify=y,\n",
    ")\n",
    "X_train_processed = preprocessor.fit_transform(X_train)\n",
    "X_test_processed = preprocessor.transform(X_test)\n",
    "print('Processed train shape:', X_train_processed.shape)\n",
    "print('Processed test shape:', X_test_processed.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddfb6060",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:24.736061Z",
     "iopub.status.busy": "2026-05-12T09:42:24.736061Z",
     "iopub.status.idle": "2026-05-12T09:42:33.550982Z",
     "shell.execute_reply": "2026-05-12T09:42:33.550982Z"
    }
   },
   "outputs": [],
   "source": [
    "# One hidden layer of 6 ReLU units followed by a single sigmoid output unit.\n",
    "model = tf.keras.Sequential(\n",
    "    layers=[\n",
    "        tf.keras.layers.Input(shape=(X_train_processed.shape[1],)),\n",
    "        tf.keras.layers.Dense(units=6, activation='relu'),\n",
    "        tf.keras.layers.Dense(units=1, activation='sigmoid'),\n",
    "    ]\n",
    ")\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "\n",
    "# Keras takes the validation rows from the end of the training data (here 20%).\n",
    "history = model.fit(\n",
    "    x=X_train_processed,\n",
    "    y=y_train,\n",
    "    batch_size=10,\n",
    "    epochs=10,\n",
    "    validation_split=0.2,\n",
    "    verbose=0,\n",
    ")\n",
    "\n",
    "# Show the metrics of the first epochs as the cell output.\n",
    "pd.DataFrame(history.history).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b620163",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:33.554028Z",
     "iopub.status.busy": "2026-05-12T09:42:33.554028Z",
     "iopub.status.idle": "2026-05-12T09:42:33.720554Z",
     "shell.execute_reply": "2026-05-12T09:42:33.720554Z"
    }
   },
   "outputs": [],
   "source": [
    "# Flatten the model output to one churn probability per test row.\n",
    "test_probabilities = model.predict(X_test_processed, verbose=0).reshape(-1)\n",
    "\n",
    "# A probability of 0.5 or more is labelled as churn (1).\n",
    "test_predictions = (test_probabilities >= 0.5).astype(int)\n",
    "\n",
    "test_confusion = confusion_matrix(y_test, test_predictions)\n",
    "test_accuracy = float(accuracy_score(y_test, test_predictions))\n",
    "print('Test accuracy:', round(test_accuracy, 4))\n",
    "print('Confusion matrix:', test_confusion.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0c46142",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:33.723075Z",
     "iopub.status.busy": "2026-05-12T09:42:33.723075Z",
     "iopub.status.idle": "2026-05-12T09:42:34.242237Z",
     "shell.execute_reply": "2026-05-12T09:42:34.241732Z"
    }
   },
   "outputs": [],
   "source": [
    "# Training curves: one panel per metric, training vs. validation.\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
    "for panel, metric, title in zip(axes, ('accuracy', 'loss'), ('Accuracy by Epoch', 'Loss by Epoch')):\n",
    "    panel.plot(history.history[metric], label='train')\n",
    "    panel.plot(history.history[f'val_{metric}'], label='validation')\n",
    "    panel.set_title(title)\n",
    "    # Label both axes so the saved figure can be read on its own.\n",
    "    panel.set_xlabel('Epoch')\n",
    "    panel.set_ylabel(metric.capitalize())\n",
    "    panel.legend()\n",
    "fig.tight_layout()\n",
    "fig.savefig(PLOTS_DIR / 'training_history.png', dpi=150)\n",
    "plt.show()\n",
    "plt.close(fig)\n",
    "\n",
    "# Confusion matrix of the test-set predictions (rows: actual, columns: predicted).\n",
    "fig, ax = plt.subplots(figsize=(5, 4))\n",
    "sns.heatmap(test_confusion, annot=True, fmt='d', cmap='Blues', ax=ax)\n",
    "ax.set_title('Confusion Matrix')\n",
    "ax.set_xlabel('Predicted')\n",
    "ax.set_ylabel('Actual')\n",
    "fig.tight_layout()\n",
    "fig.savefig(PLOTS_DIR / 'confusion_matrix.png', dpi=150)\n",
    "plt.show()\n",
    "plt.close(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9f03c95",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:34.244256Z",
     "iopub.status.busy": "2026-05-12T09:42:34.244256Z",
     "iopub.status.idle": "2026-05-12T09:42:34.324233Z",
     "shell.execute_reply": "2026-05-12T09:42:34.323180Z"
    }
   },
   "outputs": [],
   "source": [
    "# Single-row frame describing the customer to score.\n",
    "sample_customer = pd.DataFrame([\n",
    "    dict(\n",
    "        CreditScore=600,\n",
    "        Geography='France',\n",
    "        Gender='Male',\n",
    "        Age=40,\n",
    "        Tenure=3,\n",
    "        Balance=60000,\n",
    "        NumOfProducts=2,\n",
    "        HasCrCard=1,\n",
    "        IsActiveMember=1,\n",
    "        EstimatedSalary=50000,\n",
    "    )\n",
    "])\n",
    "\n",
    "# Reuse the preprocessor fitted on the training data, then score with the trained model.\n",
    "sample_processed = preprocessor.transform(sample_customer)\n",
    "sample_probability = float(model.predict(sample_processed, verbose=0).reshape(-1)[0])\n",
    "\n",
    "# Same 0.5 cut-off as the test-set evaluation.\n",
    "sample_prediction = int(sample_probability >= 0.5)\n",
    "if sample_prediction == 1:\n",
    "    sample_decision = 'Do not allow to go'\n",
    "else:\n",
    "    sample_decision = 'Allow to stay'\n",
    "\n",
    "dict(\n",
    "    sample_probability=round(sample_probability, 4),\n",
    "    sample_prediction=sample_prediction,\n",
    "    sample_decision=sample_decision,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bb67d70",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:42:34.326357Z",
     "iopub.status.busy": "2026-05-12T09:42:34.325838Z",
     "iopub.status.idle": "2026-05-12T09:42:34.339111Z",
     "shell.execute_reply": "2026-05-12T09:42:34.338580Z"
    }
   },
   "outputs": [],
   "source": [
    "# Per-epoch training metrics.\n",
    "history_df = pd.DataFrame(history.history)\n",
    "history_df.to_csv(OUTPUTS_DIR / 'session_9_training_history.csv', index=False)\n",
    "\n",
    "# Test-set predictions; only the first 100 rows are exported as a sample.\n",
    "prediction_frame = pd.DataFrame({\n",
    "    'actual': y_test.reset_index(drop=True),\n",
    "    'predicted_probability': test_probabilities,\n",
    "    'predicted_label': test_predictions,\n",
    "})\n",
    "prediction_frame.iloc[:100].to_csv(OUTPUTS_DIR / 'session_9_prediction_samples.csv', index=False)\n",
    "\n",
    "# Headline results of the run, saved as JSON and shown as the cell output.\n",
    "summary = dict(\n",
    "    dataset_shape=list(df.shape),\n",
    "    target_distribution=df['Exited'].value_counts().to_dict(),\n",
    "    processed_feature_count=int(X_train_processed.shape[1]),\n",
    "    test_accuracy=test_accuracy,\n",
    "    confusion_matrix=test_confusion.tolist(),\n",
    "    sample_customer_probability=sample_probability,\n",
    "    sample_customer_prediction=sample_prediction,\n",
    "    sample_customer_decision=sample_decision,\n",
    ")\n",
    "(OUTPUTS_DIR / 'session_9_summary.json').write_text(json.dumps(summary, indent=2), encoding='utf-8')\n",
    "summary"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv (3.12.10)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
