{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Local path to gluten project.\n",
    "gluten_home='/home/sparkuser/gluten'\n",
    "\n",
    "# Local path to gluten jar.\n",
    "gluten_target_jar='/home/sparkuser/gluten-velox-bundle-spark3.3_2.12-centos_7_x86_64-1.3.0.jar'\n",
    "\n",
    "# Spark app master. e.g. 'yarn'\n",
    "master='yarn'\n",
    "\n",
    "# List of workers.\n",
    "clients=['localhost']\n",
    "\n",
    "# List of block devices. e.g. ['nvme1n1', 'nvme2n1']\n",
    "disk_dev=[]\n",
    "\n",
    "# List of network devices. e.g. ['ens787f0']\n",
    "nic_dev=[]\n",
    "\n",
    "# Select workload. Can be either 'tpch' or 'tpcds'.\n",
    "workload='tpch'\n",
    "\n",
    "# Run with gluten. If False, run Spark.\n",
    "run_gluten=True\n",
    "\n",
    "# TPC tables\n",
    "tpch_tabledir=''\n",
    "tpcds_tabledir=''\n",
    "\n",
    "# Parallelism\n",
    "executors_per_node=32\n",
    "cores_per_executor=7\n",
    "\n",
    "gluten_tpch_task_per_core=2\n",
    "gluten_tpcds_task_per_core=4\n",
    "spark_tpch_task_per_core=8\n",
    "spark_tpcds_task_per_core=8\n",
    "\n",
    "# Physical memory on each worker node.\n",
    "memory_per_node='1000g'\n",
    "\n",
    "# Offheap ratio. 0 to disable offheap for Spark.\n",
    "# onheap:offheap = 1:2\n",
    "spark_offheap_ratio=2.0\n",
    "# onheap:offheap = 1:7\n",
    "gluten_offheap_ratio=7.0\n",
    "\n",
    "# spark.io.compression.codec\n",
    "spark_codec='lz4'\n",
    "# spark.gluten.sql.columnar.shuffle.codec\n",
    "gluten_codec='lz4'\n",
    "# spark.gluten.sql.columnar.shuffle.codecBackend\n",
    "gluten_codec_backend=''\n",
    "# spark.gluten.sql.columnar.maxBatchSize\n",
    "max_batch_size=4096\n",
    "# spark.app.name, empty to use default name.\n",
    "app_name=''\n",
    "\n",
    "# Hostname or IP to server for perf analysis. Able to connect via ssh.\n",
    "server=''\n",
    "\n",
    "# Gluten home on server.\n",
    "server_gluten_home='/home/sparkuser/gluten'\n",
    "\n",
    "# Specify the directory on perf analysis server. Usually a codename for this run.\n",
    "base_dir=''\n",
    "\n",
    "# Proxy used to connect to server for perf analysis.\n",
    "proxy=''\n",
    "\n",
    "# Emon event file for `emon -i`. Set to emptry string '' if emon is unavailable.\n",
    "# Supported emon events on platform can be verified via `emon -i emon.list`\n",
    "emon_list=''\n",
    "\n",
    "# Whether to run perf analysis scripts. Only takes effect if server is set.\n",
    "analyze_perf=False\n",
    "\n",
    "# List of email to receive perf analysis results.\n",
    "emails = []\n",
    "\n",
    "# Pull request number.\n",
    "pr=''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "initialize_ipynb = !realpath native_sql_initialize.ipynb\n",
    "print(f\"Running notebook: {initialize_ipynb[0]}\\n\")\n",
    "%run {initialize_ipynb[0]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newClients = []\n",
    "for l in clients:\n",
    "    if l == 'localhost':\n",
    "        newClients.append(localhost)\n",
    "    else:\n",
    "        newClients.append(l)\n",
    "clients = newClients\n",
    "\n",
    "if server == 'localhost':\n",
    "    server = localhost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not app_name and run_gluten:\n",
    "    if pr.isdigit():\n",
    "        app_name = f'PR{pr}'\n",
    "    elif not pr:\n",
    "        app_name = 'main'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "IPython.notebook.kernel.execute('nb_name = \"' + IPython.notebook.notebook_name + '\"')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "deletable": false,
    "editable": false,
    "run_control": {
     "frozen": true
    }
   },
   "outputs": [],
   "source": [
    "nb_name=PAPERMILL_OUTPUT_PATH"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Application Level Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tpch_workload=False\n",
    "tpcds_workload=False\n",
    "\n",
    "if workload.lower() == 'tpch':\n",
    "    tpch_workload=True\n",
    "elif workload.lower() == 'tpcds':\n",
    "    tpcds_workload=True\n",
    "else:\n",
    "    raise ValueError(f\"Unknown workload: {workload}\")\n",
    "\n",
    "def gluten_conf_overwrite(conf):\n",
    "    conf.set('spark.gluten.sql.columnar.shuffle.codec', gluten_codec)\\\n",
    "        .set('spark.gluten.sql.columnar.shuffle.codecBackend', gluten_codec_backend)\\\n",
    "        .set('spark.gluten.sql.columnar.maxBatchSize', max_batch_size)\\\n",
    "        .set('spark.executor.extraJavaOptions',\\\n",
    "            '-XX:+UseParallelOldGC -XX:ParallelGCThreads=2 -XX:NewRatio=1 -XX:SurvivorRatio=1 -XX:+UseCompressedOops -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:ErrorFile=/home/sparkuser/logs/java/hs_err_pid%p.log')\\\n",
    "        .set('spark.gluten.memory.overAcquiredMemoryRatio','0')\\\n",
    "\n",
    "    if tpch_workload:\n",
    "        pass\n",
    "    elif tpcds_workload:\n",
    "        pass\n",
    "    return conf\n",
    "\n",
    "def spark_conf_overwrite(conf):\n",
    "    conf.set('spark.io.compression.codec', spark_codec)\\\n",
    "        .set('spark.executorEnv.LD_LIBRARY_PATH',f\"{os.getenv('HADOOP_HOME')}/lib/native/\") \\\n",
    "        .set('spark.yarn.appMasterEnv.LD_LIBRARY_PATH',f\"{os.getenv('HADOOP_HOME')}/lib/native/\") \\\n",
    "\n",
    "    if tpch_workload:\n",
    "        pass\n",
    "    elif tpcds_workload:\n",
    "        pass\n",
    "    return conf\n",
    "\n",
    "def app_conf_overwrite(conf):\n",
    "    if run_gluten:\n",
    "        return gluten_conf_overwrite(conf)\n",
    "    return spark_conf_overwrite(conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run Workload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Config and clean pagecache before each run\n",
    "config_pagecache(clients, run_gluten)\n",
    "dropcache(clients)\n",
    "print_kernel_params(clients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create SparkSession\n",
    "sc, spark, appid, test_tpc=create_cntx(run_gluten, workload, app_conf_overwrite, server, base_dir, nb_name, app_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if run_gluten:\n",
    "    config_mem_cgroup(clients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_tpc.start_monitor(clients, emon_list=emon_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_tpc.power_run(explain=False, print_result=False, load_table=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_tpc.stop_monitor(clients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if analyze_perf:\n",
    "    test_tpc.run_perf_analysis(server_gluten_home, disk_dev, nic_dev, proxy, emails, pr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_tpc.print_result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "for client in clients:\n",
    "    draw_sar(appid, qtime=test_tpc.result, disk_dev=disk_dev, nic_dev=nic_dev, client=client)"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "428.672px",
    "left": "1339.91px",
    "top": "374.297px",
    "width": "456.969px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}