{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "34a481f5",
   "metadata": {},
   "source": [
    "# Capstone Session 8\n",
    "\n",
    "This notebook was generated from the directions in the copied `Capstone_Session_8.pdf` and works with the staged `movies.csv` and `ratings.csv` datasets."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e093c797",
   "metadata": {},
   "source": [
    "## Objective\n",
    "\n",
    "Demonstrate user-based, item-based, and model-based recommendation techniques using the staged movie ratings data."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1cfd759b",
   "metadata": {},
   "source": [
    "## Environment Note\n",
    "\n",
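    "This notebook uses `scikit-surprise` directly for the model-based recommendation tasks required by the PDF. When NumPy is missing or at version 2 or later, or when `surprise` cannot be imported, the setup cell makes sure a NumPy 1.x release is installed and installs `scikit-surprise` with pip before `KNNBasic`, `SVD`, and `NMF` are run. In Google Colab it first installs the `build-essential` build dependency through apt; on a local runtime a failed install is reported as a missing Microsoft Visual C++ Build Tools / NumPy 1.x requirement."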
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c6ae14e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:34:37.289922Z",
     "iopub.status.busy": "2026-05-12T09:34:37.288416Z",
     "iopub.status.idle": "2026-05-12T09:34:39.378831Z",
     "shell.execute_reply": "2026-05-12T09:34:39.377826Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Runtime: Local / notebook runtime\n",
      "Capstone directory: X:\\SIMPLILEARN\\FrancisBurnetCom\\Incremental Capstones\\Machine Learning Using Python\\Capstone Session 8\n",
      "Movies source: https://raw.githubusercontent.com/FrancisBurnet/francisburnet/main/Incremental%20Capstones/Machine%20Learning%20Using%20Python/Capstone%20Session%208/movies.csv\n",
      "Ratings source: https://raw.githubusercontent.com/FrancisBurnet/francisburnet/main/Incremental%20Capstones/Machine%20Learning%20Using%20Python/Capstone%20Session%208/ratings.csv\n",
      "Output mode: permanent capstone outputs\n",
      "Outputs directory: X:\\SIMPLILEARN\\FrancisBurnetCom\\Incremental Capstones\\Machine Learning Using Python\\Capstone Session 8\\outputs\n",
      "NumPy version: 1.26.4\n",
      "scikit-surprise import ready\n"
     ]
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "import importlib\n",
    "from importlib import metadata as importlib_metadata\n",
    "import json\n",
    "import subprocess\n",
    "import sys\n",
    "from urllib.parse import quote\n",
    "\n",
    "# True when the google.colab module is already loaded in this interpreter.\n",
    "IS_COLAB = 'google.colab' in sys.modules\n",
    "# Repository coordinates used to assemble raw.githubusercontent.com URLs.\n",
    "GITHUB_REPO_OWNER = 'FrancisBurnet'\n",
    "GITHUB_REPO_NAME = 'francisburnet'\n",
    "GITHUB_REPO_BRANCH = 'main'\n",
    "# Repository-relative folder of this capstone; also matched against the working directory.\n",
    "CAPSTONE_ROOT = Path('Incremental Capstones/Machine Learning Using Python/Capstone Session 8')\n",
    "# Dataset file names located directly under CAPSTONE_ROOT.\n",
    "MOVIES_FILENAME = 'movies.csv'\n",
    "RATINGS_FILENAME = 'ratings.csv'\n",
    "\n",
    "\n",
    "def build_raw_github_url(relative_path: Path) -> str:\n",
    "    \"\"\"Return the raw.githubusercontent.com URL for a repository-relative path.\n",
    "\n",
    "    The path is rendered with forward slashes and percent-encoded (slashes are\n",
    "    kept) before it is appended to the owner/repository/branch prefix.\n",
    "    \"\"\"\n",
    "    url_segments = (\n",
    "        'https://raw.githubusercontent.com',\n",
    "        GITHUB_REPO_OWNER,\n",
    "        GITHUB_REPO_NAME,\n",
    "        GITHUB_REPO_BRANCH,\n",
    "        quote(relative_path.as_posix(), safe='/'),\n",
    "    )\n",
    "    return '/'.join(url_segments)\n",
    "\n",
    "\n",
    "def resolve_capstone_dir() -> Path | None:\n",
    "    \"\"\"Locate the capstone folder starting from the current working directory.\n",
    "\n",
    "    Every directory from the resolved working directory up to the filesystem\n",
    "    root is checked in turn: it is returned when its trailing path components\n",
    "    equal CAPSTONE_ROOT, otherwise CAPSTONE_ROOT is looked for beneath it.\n",
    "    Returns None when no directory on that chain qualifies.\n",
    "    \"\"\"\n",
    "    root_parts = CAPSTONE_ROOT.parts\n",
    "    depth = len(root_parts)\n",
    "    start = Path.cwd().resolve()\n",
    "    for ancestor in (start, *start.parents):\n",
    "        ancestor_parts = ancestor.parts\n",
    "        # Case 1: this directory itself is the capstone folder.\n",
    "        if len(ancestor_parts) >= depth and ancestor_parts[-depth:] == root_parts:\n",
    "            return ancestor\n",
    "        # Case 2: the capstone folder exists somewhere below this directory.\n",
    "        below = ancestor / CAPSTONE_ROOT\n",
    "        if below.exists():\n",
    "            return below\n",
    "    return None\n",
    "\n",
    "\n",
    "# Remote dataset locations and the local capstone folder (None when not found).\n",
    "MOVIES_URL = build_raw_github_url(CAPSTONE_ROOT / MOVIES_FILENAME)\n",
    "RATINGS_URL = build_raw_github_url(CAPSTONE_ROOT / RATINGS_FILENAME)\n",
    "CAPSTONE_DIR = resolve_capstone_dir()\n",
    "\n",
    "if CAPSTONE_DIR is None:\n",
    "    # No capstone folder on the working-directory chain: use a scratch folder.\n",
    "    if IS_COLAB:\n",
    "        runtime_root = Path('/content/capstone-session-8-runtime')\n",
    "    else:\n",
    "        runtime_root = Path.cwd().resolve() / 'capstone-session-8-runtime'\n",
    "    OUTPUT_ROOT = runtime_root\n",
    "    OUTPUT_MODE = 'runtime scratch outputs; export final artifacts back into the capstone outputs folder'\n",
    "    OUTPUT_DISPLAY = 'capstone-session-8-runtime/outputs'\n",
    "else:\n",
    "    # Capstone folder found: artifacts are written next to the notebook's data.\n",
    "    OUTPUT_ROOT = CAPSTONE_DIR\n",
    "    OUTPUT_MODE = 'permanent capstone outputs'\n",
    "    OUTPUT_DISPLAY = (CAPSTONE_ROOT / 'outputs').as_posix()\n",
    "\n",
    "# Absolute artifact folders; both are created up front so later cells can write freely.\n",
    "OUTPUTS_DIR = (OUTPUT_ROOT / 'outputs').resolve()\n",
    "PLOTS_DIR = OUTPUTS_DIR / 'plots'\n",
    "OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "PLOTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "\n",
    "def installed_version(package_name: str) -> str | None:\n",
    "    \"\"\"Return the installed version of a distribution, or None if it is not installed.\"\"\"\n",
    "    try:\n",
    "        version_text = importlib_metadata.version(package_name)\n",
    "    except importlib_metadata.PackageNotFoundError:\n",
    "        version_text = None\n",
    "    return version_text\n",
    "\n",
    "\n",
    "def surprise_import_ready() -> bool:\n",
    "    \"\"\"Report whether the surprise package can be imported in this interpreter.\n",
    "\n",
    "    Any exception raised by the import counts as not ready, not only ImportError.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        importlib.import_module('surprise')\n",
    "    except Exception:\n",
    "        return False\n",
    "    else:\n",
    "        return True\n",
    "\n",
    "\n",
    "# Decide whether the scikit-surprise toolchain has to be (re)installed:\n",
    "# NumPy must be present with a major version below 2, and surprise must import.\n",
    "numpy_version = installed_version('numpy')\n",
    "needs_numpy_pin = numpy_version is None or int(numpy_version.split('.')[0]) >= 2\n",
    "needs_surprise_setup = needs_numpy_pin or not surprise_import_ready()\n",
    "\n",
    "if needs_surprise_setup:\n",
    "    try:\n",
    "        if IS_COLAB:\n",
    "            # Colab only: install the build-essential toolchain through apt before pip runs.\n",
    "            subprocess.run(['apt-get', 'update', '-qq'], check=True)\n",
    "            subprocess.run(['apt-get', 'install', '-y', 'build-essential'], check=True)\n",
    "        if needs_numpy_pin:\n",
    "            # Replace NumPy only when it is missing or 2.x. Force-reinstalling an\n",
    "            # already compatible 1.x build would needlessly overwrite a package\n",
    "            # that this interpreter may already have loaded.\n",
    "            subprocess.run([sys.executable, '-m', 'pip', 'install', '--force-reinstall', 'numpy<2'], check=True)\n",
    "        # --no-deps keeps pip from pulling a different NumPy back in.\n",
    "        subprocess.run([sys.executable, '-m', 'pip', 'install', '--force-reinstall', '--no-deps', 'scikit-surprise'], check=True)\n",
    "        # Let the import system notice the freshly installed packages.\n",
    "        importlib.invalidate_caches()\n",
    "    except subprocess.CalledProcessError as exc:\n",
    "        if not IS_COLAB:\n",
    "            raise RuntimeError(\n",
    "                'Session 8 requires Microsoft Visual C++ Build Tools and a NumPy 1.x runtime for scikit-surprise. '\n",
    "                'Install the Visual Studio C++ workload, then rerun this cell.'\n",
    "            ) from exc\n",
    "        raise\n",
    "\n",
    "    # pip only changes files on disk: a NumPy 2.x module imported earlier in this\n",
    "    # interpreter stays loaded, so the imports below would still see it.\n",
    "    loaded_numpy = sys.modules.get('numpy')\n",
    "    if loaded_numpy is not None and int(str(getattr(loaded_numpy, '__version__', '0')).split('.')[0]) >= 2:\n",
    "        print(\n",
    "            'NumPy 2.x was already imported in this kernel; restart the runtime and rerun this cell '\n",
    "            'so that the NumPy 1.x installation takes effect.',\n",
    "            file=sys.stderr,\n",
    "        )\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from IPython.display import display\n",
    "from surprise import Dataset, KNNBasic, NMF as SurpriseNMF, Reader, SVD\n",
    "from surprise.model_selection import KFold as SurpriseKFold, cross_validate\n",
    "\n",
    "# Display defaults applied to every later cell.\n",
    "pd.set_option('display.max_columns', 100)\n",
    "sns.set_theme(style='whitegrid')\n",
    "\n",
    "# Echo the resolved configuration for this run as label/value pairs.\n",
    "setup_report = (\n",
    "    ('Runtime:', 'Google Colab' if IS_COLAB else 'Notebook runtime'),\n",
    "    ('Capstone artifact path:', CAPSTONE_ROOT.as_posix()),\n",
    "    ('Movies source:', MOVIES_URL),\n",
    "    ('Ratings source:', RATINGS_URL),\n",
    "    ('Output mode:', OUTPUT_MODE),\n",
    "    ('Output target:', OUTPUT_DISPLAY),\n",
    "    ('NumPy version:', np.__version__),\n",
    ")\n",
    "for report_label, report_value in setup_report:\n",
    "    print(report_label, report_value)\n",
    "print('scikit-surprise import ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "353f6d23",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:34:39.380336Z",
     "iopub.status.busy": "2026-05-12T09:34:39.380336Z",
     "iopub.status.idle": "2026-05-12T09:34:39.518785Z",
     "shell.execute_reply": "2026-05-12T09:34:39.517779Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982703</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981247</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982224</td>\n",
       "      <td>Heat (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964983815</td>\n",
       "      <td>Seven (a.k.a. Se7en) (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964982931</td>\n",
       "      <td>Usual Suspects, The (1995)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating  timestamp                        title\n",
       "0       1        1     4.0  964982703             Toy Story (1995)\n",
       "1       1        3     4.0  964981247      Grumpier Old Men (1995)\n",
       "2       1        6     4.0  964982224                  Heat (1995)\n",
       "3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)\n",
       "4       1       50     5.0  964982931   Usual Suspects, The (1995)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Movies source used: https://raw.githubusercontent.com/FrancisBurnet/francisburnet/main/Incremental%20Capstones/Machine%20Learning%20Using%20Python/Capstone%20Session%208/movies.csv\n",
      "Ratings source used: https://raw.githubusercontent.com/FrancisBurnet/francisburnet/main/Incremental%20Capstones/Machine%20Learning%20Using%20Python/Capstone%20Session%208/ratings.csv\n",
      "Movies shape: (9742, 3)\n",
      "Ratings shape: (100836, 4)\n",
      "Merged shape: (100836, 5)\n",
      "User-item shape: (610, 9719)\n"
     ]
    }
   ],
   "source": [
    "# Load both datasets straight from the repository's raw URLs.\n",
    "movies = pd.read_csv(MOVIES_URL)\n",
    "ratings = pd.read_csv(RATINGS_URL)\n",
    "\n",
    "# Attach titles to the ratings. validate='many_to_one' makes pandas raise a\n",
    "# MergeError if movieId is duplicated in movies, which would otherwise silently\n",
    "# multiply rating rows and distort every statistic computed below.\n",
    "merged = ratings.merge(\n",
    "    movies[['movieId', 'title']],\n",
    "    on='movieId',\n",
    "    how='left',\n",
    "    validate='many_to_one',\n",
    ")\n",
    "\n",
    "# Users x titles rating matrix. pivot_table aggregates with the mean by default,\n",
    "# so ratings that share a userId and a title collapse into a single cell.\n",
    "user_item = merged.pivot_table(index='userId', columns='title', values='rating')\n",
    "\n",
    "display(merged.head())\n",
    "print('Movies source used:', MOVIES_URL)\n",
    "print('Ratings source used:', RATINGS_URL)\n",
    "print('Movies shape:', movies.shape)\n",
    "print('Ratings shape:', ratings.shape)\n",
    "print('Merged shape:', merged.shape)\n",
    "print('User-item shape:', user_item.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b45a83f8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:34:39.520785Z",
     "iopub.status.busy": "2026-05-12T09:34:39.520785Z",
     "iopub.status.idle": "2026-05-12T09:34:48.258715Z",
     "shell.execute_reply": "2026-05-12T09:34:48.257589Z"
    }
   },
   "outputs": [],
   "source": [
    "# User-based collaborative filtering: estimate user 1's rating for movieId 32.\n",
    "\n",
    "# Replace each user's unrated titles with that user's own mean rating. One\n",
    "# vectorized where() call (row means aligned on the index via axis=0) builds the\n",
    "# same frame as filling row by row with a Python lambda, without per-row overhead.\n",
    "user_means = user_item.mean(axis=1)\n",
    "user_filled = user_item.where(user_item.notna(), user_means, axis=0)\n",
    "\n",
    "# Correlate users with each other (corr() works column-wise, hence the transpose).\n",
    "user_corr = user_filled.T.corr()\n",
    "# Other users ranked by correlation with user 1; user 1 and undefined values are removed.\n",
    "user_1_corr = user_corr.loc[1].drop(index=1).dropna().sort_values(ascending=False)\n",
    "top_50_users = user_1_corr.head(50)\n",
    "\n",
    "movie_32_title = movies.loc[movies['movieId'] == 32, 'title'].iloc[0]\n",
    "movie_32_ratings = merged.loc[merged['movieId'] == 32, ['userId', 'rating']].set_index('userId')\n",
    "# Only neighbours who actually rated movieId 32 can contribute to the estimate.\n",
    "eligible = top_50_users[top_50_users.index.isin(movie_32_ratings.index)]\n",
    "if eligible.empty:\n",
    "    # No neighbour rated the movie: fall back to its overall mean rating.\n",
    "    predicted_user_1_rating = float(merged.loc[merged['movieId'] == 32, 'rating'].mean())\n",
    "else:\n",
    "    weighted_ratings = movie_32_ratings.loc[eligible.index, 'rating']\n",
    "    denominator = float(np.abs(eligible).sum())\n",
    "    if denominator:\n",
    "        # Correlation-weighted average of the neighbours' ratings.\n",
    "        predicted_user_1_rating = float(np.dot(eligible.values, weighted_ratings.values) / denominator)\n",
    "    else:\n",
    "        # All weights are zero: use the neighbours' plain mean instead of dividing by zero.\n",
    "        predicted_user_1_rating = float(weighted_ratings.mean())\n",
    "\n",
    "# Persist the neighbour list and show its head plus the prediction.\n",
    "top_50_df = top_50_users.reset_index()\n",
    "top_50_df.columns = ['userId', 'correlation']\n",
    "top_50_df.to_csv(OUTPUTS_DIR / 'session_8_top_50_user_correlations.csv', index=False)\n",
    "display(top_50_df.head(10))\n",
    "{'movieId_32_title': movie_32_title, 'predicted_user_1_rating_for_movie_32': round(predicted_user_1_rating, 4)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a74c0e2",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:34:48.262220Z",
     "iopub.status.busy": "2026-05-12T09:34:48.260713Z",
     "iopub.status.idle": "2026-05-12T09:37:01.141422Z",
     "shell.execute_reply": "2026-05-12T09:37:01.140320Z"
    }
   },
   "outputs": [],
   "source": [
    "# Item-based collaborative filtering: the ten titles most correlated with Jurassic Park.\n",
    "\n",
    "# Replace each title's missing ratings with that title's own mean rating.\n",
    "item_filled = user_item.apply(lambda column: column.fillna(column.mean()), axis=0)\n",
    "jurassic_title = 'Jurassic Park (1993)'\n",
    "# Only one title's correlations are used, so correlate every column against that\n",
    "# single column instead of building the full title-by-title correlation matrix.\n",
    "jurassic_corr = item_filled.corrwith(item_filled[jurassic_title])\n",
    "# Drop the title itself and undefined correlations, then keep the ten highest.\n",
    "jurassic_similar = jurassic_corr.drop(index=jurassic_title).dropna().sort_values(ascending=False).head(10)\n",
    "similar_movies_df = jurassic_similar.reset_index()\n",
    "similar_movies_df.columns = ['title', 'correlation']\n",
    "similar_movies_df.to_csv(OUTPUTS_DIR / 'session_8_similar_movies.csv', index=False)\n",
    "display(similar_movies_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcfe99c8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:37:01.144390Z",
     "iopub.status.busy": "2026-05-12T09:37:01.144390Z",
     "iopub.status.idle": "2026-05-12T09:37:01.170980Z",
     "shell.execute_reply": "2026-05-12T09:37:01.169918Z"
    }
   },
   "outputs": [],
   "source": [
    "# Model-based recommenders: cross-validate KNNBasic, SVD and NMF on the ratings.\n",
    "\n",
    "# The rating scale handed to surprise is the observed minimum and maximum rating.\n",
    "rating_floor = float(ratings['rating'].min())\n",
    "rating_ceiling = float(ratings['rating'].max())\n",
    "reader = Reader(rating_scale=(rating_floor, rating_ceiling))\n",
    "surprise_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)\n",
    "# One seeded, shuffled 5-fold splitter is shared by all three models.\n",
    "surprise_cv = SurpriseKFold(n_splits=5, random_state=42, shuffle=True)\n",
    "\n",
    "# Each entry: (display name, configured algorithm, parameters recorded in the outputs).\n",
    "model_specs = [\n",
    "    ('KNNBasic', KNNBasic(k=20, sim_options={'name': 'msd', 'user_based': True}), {'k': 20, 'sim_options': {'name': 'msd', 'user_based': True}}),\n",
    "    ('SVD', SVD(random_state=42), {'random_state': 42}),\n",
    "    ('NMF', SurpriseNMF(random_state=42), {'random_state': 42}),\n",
    "]\n",
    "\n",
    "fold_records = []\n",
    "model_summaries = []\n",
    "for model_name, algorithm, parameters in model_specs:\n",
    "    cv_result = cross_validate(algorithm, surprise_data, measures=['RMSE'], cv=surprise_cv, verbose=False, n_jobs=1)\n",
    "    rmse_scores = list(map(float, cv_result['test_rmse']))\n",
    "    # Parameters are stored as a JSON string so they fit into a flat CSV column.\n",
    "    parameters_json = json.dumps(parameters, sort_keys=True)\n",
    "    fold_records.extend(\n",
    "        {'fold': fold_number, 'model': model_name, 'rmse': fold_rmse, 'parameters': parameters_json}\n",
    "        for fold_number, fold_rmse in enumerate(rmse_scores, start=1)\n",
    "    )\n",
    "    # Per-model summary: mean RMSE over the folds and the single best fold.\n",
    "    mean_rmse = float(np.mean(rmse_scores))\n",
    "    lowest_rmse = float(np.min(rmse_scores))\n",
    "    model_summaries.append(\n",
    "        {'model': model_name, 'parameters': parameters, 'rmse': mean_rmse, 'best_score': lowest_rmse}\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cdc7f4d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:37:01.175392Z",
     "iopub.status.busy": "2026-05-12T09:37:01.174876Z",
     "iopub.status.idle": "2026-05-12T09:38:08.903145Z",
     "shell.execute_reply": "2026-05-12T09:38:08.903145Z"
    }
   },
   "outputs": [],
   "source": [
    "# Per-fold scores and per-model averages (lowest mean RMSE first).\n",
    "fold_results = pd.DataFrame(fold_records)\n",
    "summary_results = (\n",
    "    pd.DataFrame(model_summaries)\n",
    "    .sort_values('rmse')\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "display(fold_results.head(9), summary_results)\n",
    "# After the ascending sort the first row is the model with the lowest mean RMSE.\n",
    "best_model = summary_results.iloc[0].to_dict()\n",
    "best_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79ebdf60",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-05-12T09:38:08.906321Z",
     "iopub.status.busy": "2026-05-12T09:38:08.905322Z",
     "iopub.status.idle": "2026-05-12T09:38:09.171551Z",
     "shell.execute_reply": "2026-05-12T09:38:09.170542Z"
    }
   },
   "outputs": [],
   "source": [
    "# Bar chart of the mean cross-validated RMSE per model, saved under PLOTS_DIR.\n",
    "bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "bars = ax.bar(summary_results['model'], summary_results['rmse'], color=bar_colors)\n",
    "ax.set(\n",
    "    title='Session 8 Model-Based RMSE Comparison',\n",
    "    ylabel='Average 5-Fold RMSE',\n",
    "    xlabel='Model',\n",
    ")\n",
    "ax.bar_label(bars, fmt='%.3f', padding=3)\n",
    "# Leave headroom above the tallest bar so its value label is not clipped.\n",
    "y_top = summary_results['rmse'].max() + 0.08\n",
    "ax.set_ylim(0, y_top)\n",
    "fig.tight_layout()\n",
    "fig.savefig(PLOTS_DIR / 'model_based_rmse.png', dpi=150)\n",
    "plt.show()\n",
    "plt.close(fig)\n",
    "\n",
    "# Export the per-fold table and a JSON digest of every result in the notebook.\n",
    "fold_results.to_csv(OUTPUTS_DIR / 'session_8_model_cv_results.csv', index=False)\n",
    "summary = {\n",
    "    'movie_id_32_title': movie_32_title,\n",
    "    'predicted_user_1_rating_for_movie_32': round(predicted_user_1_rating, 4),\n",
    "    'top_50_user_correlations_saved': 'session_8_top_50_user_correlations.csv',\n",
    "    'similar_movies_for_jurassic_park': similar_movies_df.to_dict(orient='records'),\n",
    "    'model_cv_results': summary_results.to_dict(orient='records'),\n",
    "    'best_model': best_model,\n",
    "    'environment_note': 'Model-based recommendation is executed with scikit-surprise using KNNBasic, SVD, and NMF.',\n",
    "}\n",
    "summary_path = OUTPUTS_DIR / 'session_8_summary.json'\n",
    "with summary_path.open('w', encoding='utf-8') as handle:\n",
    "    handle.write(json.dumps(summary, indent=2))\n",
    "summary"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv (3.12.10)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
