{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Spark DataFrame 10mins\n", "\n", "아파치 스파크는 In-memory 기반 범용 대용량 데이터 처리 엔진입니다.\n", "\n", "빅데이터 처리를 위해 처음 많이 쓰였던 Hadoop Map/Reduce 는 Map/Reduce 처리시 디스크 I/O로 인해서 속도의 제약이 발생하게 되어있는데 반해 스파크는 읽어온 데이터를 메모리에 저장하여 DAG(Directed Acyclic Graph) 연산을 통해 재사용하기에 Map/Reduce 대비 처리 속도 10x~100x 배 바르게 데이터 처리가 가능합니다.\n", "\n", "![](https://qph.fs.quoracdn.net/main-qimg-dddb2a8c5f004e3c1b981a10f62221df)\n", "\n", "![](https://ichi.pro/assets/images/max/724/0*hMZqY5YpmBuVoN_c)\n", "\n", "그 외에도 Scala, Java 그리고 특히 Python, R API를 지원하고 데이터 처리시 Dataframe 및 SQL 문법을 지원함으로써 개발 편의성 또한 갖추고 있습니다. 그리고 Map/Reduce 와 같이 효율적인 분산 프로그램을 짜기 위해 많은 것들을 고려할 필요 없이 단지 싱글 프로그램에서 API 를 이용하듯 개발하면 분산 클러스터에서 수행시 자동으로 분산 처리가 된다는 장점이 있습니다.\n", "\n", "이러한 특징은 분산 클러스터 뿐만 아니라 멀티 코어 머신에서도 진가를 발휘 합니다. 일반적으로 Python 및 R 언어는 싱글 코어 만을 사용하며 멀티코어 프로그래밍을 하는 것이 까다롭지만, 스파크를 이용하면 이러한 걱정 없이 멀티 코어를 모두 활용하여 데이터를 처리할 수 있습니다. \n", "\n", "참조\n", "* [PySpark DataFrame/SQL Chear Sheet](https://www.datacamp.com/community/blog/pyspark-sql-cheat-sheet)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "spark_home: /Users/comafire/Projects/skp_n4e_jupyter_sts/usr/spark-3.2.0-bin-hadoop3.2\n" ] }, { "data": { "text/html": [ "\n", "
\n", "

SparkSession - in-memory

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v3.2.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", "
spark
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from pyspark.sql import SparkSession\n", "\n", "# Python Version Mismatch Error 일 경우, os.environ 으로 직접 설정 후 실행\n", "# Exception: Python in worker has different version 2.7 than that in driver 3.5, \n", "# PySpark cannot run with different minor versions.Please check environment variables \n", "# PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n", "\n", "# os.environ[\"PYSPARK_PYTHON\"] = \"/usr/bin/python3\"\n", "# os.environ[\"PYSPARK_DRIVER_PYTHON\"] = \"/usr/bin/python3\"\n", "\n", "spark_home = os.environ.get('SPARK_HOME', None)\n", "print(f\"spark_home: {spark_home}\")\n", "\n", "spark = SparkSession.builder.master(\"local[*]\").appName(\"spark\")\n", "spark = spark.config(\"spark.driver.memory\", \"1g\")\n", "spark = spark.config(\"spark.executor.memory\", \"1g\")\n", "spark = spark.config(\"spark.python.worker.memory\", \"1g\")\n", "spark = spark.getOrCreate()\n", "\n", "sc = spark.sparkContext\n", "\n", "spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame 생성" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+-----------+----+----------+----+---------+-----+----+----+------+\n", "| Id| Model|Year|ScreenSize| RAM| HDD| W| D| H|Weight|\n", "+---+-----------+----+----------+----+---------+-----+----+----+------+\n", "| 1|MacBook Pro|2015| 15\"|16GB|512GB SSD|13.75|9.48|0.61| 4.02|\n", "| 2| MacBook|2016| 12\"| 8GB|256GB SSD|11.04|7.74|0.52| 2.03|\n", "| 3|MacBook Air|2016| 13.3\"| 8GB|128GB SSD| 12.8|8.94|0.68| 2.96|\n", "| 4| iMac|2017| 27\"|64GB| 1TB SSD| 25.6| 8.0|20.3| 20.8|\n", "+---+-----------+----+----------+----+---------+-----+----+----+------+\n", "\n" ] } ], "source": [ "# import pyspark class\n", "from pyspark.sql import *\n", "from pyspark.sql import functions as f\n", "from pyspark.sql import types as t\n", "\n", "\n", "# SparkContext 를 이용해 생성\n", "s = sc.parallelize([\n", " (1, 'MacBook Pro', 2015, '15\"', '16GB', '512GB SSD', 13.75, 9.48, 0.61, 4.02)\n", " , (2, 'MacBook', 2016, '12\"', '8GB', '256GB SSD', 11.04, 7.74, 0.52, 2.03)\n", " , (3, 'MacBook Air', 2016, '13.3\"', '8GB', '128GB SSD', 12.8, 8.94, 0.68, 2.96)\n", " , (4, 'iMac', 2017, '27\"', '64GB', '1TB SSD', 25.6, 8.0, 20.3, 20.8)\n", "])\n", "\n", "columns = ['Id', 'Model', 'Year', 'ScreenSize', 'RAM', 'HDD', 'W', 'D', 'H', 'Weight']\n", "df = spark.createDataFrame(s, columns)\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Row(id='123456', name='Computer Science')\n", "Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000)\n", "Row(department=Row(id='123456', name='Computer Science'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)])\n", "Row(department=Row(id='345678', name='Theater and Drama'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName=None, lastName='wendell', email='no-reply@berkeley.edu', salary=160000)])\n" ] } ], "source": [ "# 직접 Row 클래스를 사용해서 행 생성\n", "department1 = Row(id='123456', name='Computer Science')\n", "department2 = Row(id='789012', name='Mechanical Engineering')\n", 
"department3 = Row(id='345678', name='Theater and Drama')\n", "department4 = Row(id='901234', name='Indoor Recreation')\n", "\n", "# Class 를 사용해서 행 생성\n", "Employee = Row(\"firstName\", \"lastName\", \"email\", \"salary\")\n", "employee1 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 100000)\n", "employee2 = Employee('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)\n", "employee3 = Employee('matei', None, 'no-reply@waterloo.edu', 140000)\n", "employee4 = Employee(None, 'wendell', 'no-reply@berkeley.edu', 160000)\n", "\n", "# Create the DepartmentWithEmployees instances from Departments and Employees\n", "Group = Row(\"department\", \"employees\")\n", "group1 = Group(department1, [employee1, employee2])\n", "group2 = Group(department2, [employee3, employee4])\n", "\n", "group3 = Row(department=department3, employees=[employee1, employee4])\n", "group4 = Row(department=department4, employees=[employee2, employee3])\n", "\n", "print(department1)\n", "print(employee1)\n", "print(group1)\n", "print(group3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+\n", "| department| employees|\n", "+--------------------+--------------------+\n", "|{123456, Computer...|[{michael, armbru...|\n", "|{789012, Mechanic...|[{matei, null, no...|\n", "+--------------------+--------------------+\n", "\n", "+---------------------------+----------------------------------------------------------------------------------------------------+\n", "|department |employees |\n", "+---------------------------+----------------------------------------------------------------------------------------------------+\n", "|{345678, Theater and Drama}|[{michael, armbrust, no-reply@berkeley.edu, 100000}, {null, wendell, no-reply@berkeley.edu, 160000}]|\n", "|{901234, Indoor Recreation}|[{xiangrui, meng, no-reply@stanford.edu, 120000}, {matei, null, no-reply@waterloo.edu, 140000}] |\n", "+---------------------------+----------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ "s1 = [group1, group2]\n", "df1 = spark.createDataFrame(s1)\n", "df1.show()\n", "\n", "s2 = [group3, group4]\n", "df2 = spark.createDataFrame(s2)\n", "df2.show(df2.count(), False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------------------+-----------------------------------------------------------------------------------------------------+\n", "|department |employees |\n", "+--------------------------------+-----------------------------------------------------------------------------------------------------+\n", "|{123456, Computer Science} |[{michael, armbrust, no-reply@berkeley.edu, 100000}, {xiangrui, meng, no-reply@stanford.edu, 120000}]|\n", "|{789012, Mechanical Engineering}|[{matei, null, no-reply@waterloo.edu, 140000}, {null, wendell, no-reply@berkeley.edu, 160000}] |\n", "|{345678, Theater and Drama} |[{michael, armbrust, no-reply@berkeley.edu, 100000}, {null, wendell, no-reply@berkeley.edu, 160000}] |\n", "|{901234, Indoor Recreation} |[{xiangrui, meng, no-reply@stanford.edu, 120000}, {matei, null, no-reply@waterloo.edu, 140000}] |\n", "+--------------------------------+-----------------------------------------------------------------------------------------------------+\n", "\n", "None\n" ] } ], "source": [ "df_union = 
df1.unionAll(df2)\n", "print(df_union.show(df_union.count(), False))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "df_explode = df_union.select(f.col(\"department\").alias(\"d\"), f.explode(\"employees\").alias(\"e\"))\n", "df_explode = df_explode.selectExpr(\"d.id\", \"d.name as departmentName\", \"e.firstName\", \"e.lastName\", \"e.email\", \"e.salary\")\n", "#df_explode = df_explode.select(f.col(\"d.id\"), f.col(\"d.name\").alias(\"departmentName\"), f.col(\"e.firstName\"), f.col(\"e.lastName\"), f.col(\"e.email\"), f.col(\"e.salary\"))\n", "\n", "df_explode.show(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read/Write Parquet" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "# https://docs.databricks.com/spark/latest/data-sources/read-parquet.html\n", "\n", "path_explode = \"/tmp/df_explode.parquet\"\n", "df = df_explode\n", "\n", "df = df.repartition(1)\n", "df.write.mode('overwrite').parquet(path_explode)\n", "df = spark.read.parquet(path_explode)\n", "df.show(3)\n", "\n", "# explode를 사용할 때, explode 대상이 되는 값이 빈 리스트([])라면, 해당 Row 는 제거됩니다. \n", "# 만약 Row 는 유지하고 값만 null 로 처리하려면 Spark 2.2+ 부터 지원하는 explode_outer 함수를 사용하면 됩니다." 
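, "\n", "# A rough sketch (kept commented out so the saved output above still matches this cell):\n", "# swapping explode for explode_outer on the df_union DataFrame built earlier keeps rows whose\n", "# employees list is empty and fills the exploded columns with null; df_outer is just an illustrative name.\n", "# df_outer = df_union.select(f.col(\"department\").alias(\"d\"), f.explode_outer(\"employees\").alias(\"e\"))\n", "# df_outer.selectExpr(\"d.id\", \"d.name as departmentName\", \"e.firstName\", \"e.salary\").show()"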
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read/Write CSV" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "# https://docs.databricks.com/spark/latest/data-sources/read-csv.html\n", "\n", "path_explode = \"/tmp/df_explode.csv\"\n", "df = df_explode\n", "\n", "df = df.repartition(1)\n", "df.write.format(\"csv\").mode('overwrite').option(\"header\", \"true\").save(path_explode)\n", "df = spark.read.format(\"csv\").option(\"header\", \"true\").option(\"inferSchema\", \"true\").load(path_explode)\n", "df.show(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read/Write JSON" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+---------+------+--------+------+\n", "| departmentName| email|firstName| id|lastName|salary|\n", "+--------------------+--------------------+---------+------+--------+------+\n", "| Computer Science|no-reply@berkeley...| michael|123456|armbrust|100000|\n", "| Computer Science|no-reply@stanford...| xiangrui|123456| meng|120000|\n", "|Mechanical Engine...|no-reply@waterloo...| matei|789012| null|140000|\n", "+--------------------+--------------------+---------+------+--------+------+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "#https://docs.databricks.com/spark/latest/data-sources/read-json.html\n", " \n", "path_explode = \"/tmp/df_explode.json\"\n", "df = df_explode\n", "\n", "df = df.repartition(1)\n", "df.write.format(\"json\").mode('overwrite').save(path_explode)\n", "df = spark.read.format(\"json\").load(path_explode)\n", "df.show(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert to Pandas" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id departmentName firstName lastName email \\\n", "0 123456 Computer Science michael armbrust no-reply@berkeley.edu \n", "1 123456 Computer Science xiangrui meng no-reply@stanford.edu \n", "2 789012 Mechanical Engineering matei None no-reply@waterloo.edu \n", "3 789012 Mechanical Engineering None wendell no-reply@berkeley.edu \n", "4 345678 Theater and Drama michael armbrust no-reply@berkeley.edu \n", "\n", " salary \n", "0 100000 \n", "1 120000 \n", "2 140000 \n", "3 160000 \n", "4 100000 \n", "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|789012|Mechanical Engine...| null| 
wendell|no-reply@berkeley...|160000|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n", "None\n" ] } ], "source": [ "import pandas as pd\n", "\n", "df_pandas = df_explode.toPandas()\n", "print(df_pandas.head())\n", "\n", "df_spark = spark.createDataFrame(df_pandas)\n", "print(df_spark.show())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame 정보보기" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- id: string (nullable = true)\n", " |-- departmentName: string (nullable = true)\n", " |-- firstName: string (nullable = true)\n", " |-- lastName: string (nullable = true)\n", " |-- email: string (nullable = true)\n", " |-- salary: long (nullable = true)\n", "\n", "None\n", "StructType(List(StructField(id,StringType,true),StructField(departmentName,StringType,true),StructField(firstName,StringType,true),StructField(lastName,StringType,true),StructField(email,StringType,true),StructField(salary,LongType,true)))\n", "['id', 'departmentName', 'firstName', 'lastName', 'email', 'salary']\n", "[('id', 'string'), ('departmentName', 'string'), ('firstName', 'string'), ('lastName', 'string'), ('email', 'string'), ('salary', 'bigint')]\n" ] } ], "source": [ "df = df_explode\n", "\n", "# DataFrame 컬럼 정보 살펴보기\n", "print(df.printSchema())\n", "print(df.schema)\n", "print(df.columns)\n", "print(df.dtypes)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "only showing top 3 rows\n", "\n", "+------+----------------------+---------+--------+---------------------+------+\n", "|id |departmentName |firstName|lastName|email |salary|\n", "+------+----------------------+---------+--------+---------------------+------+\n", "|123456|Computer Science |michael |armbrust|no-reply@berkeley.edu|100000|\n", "|123456|Computer Science |xiangrui |meng |no-reply@stanford.edu|120000|\n", "|789012|Mechanical Engineering|matei |null |no-reply@waterloo.edu|140000|\n", "+------+----------------------+---------+--------+---------------------+------+\n", "only showing top 3 rows\n", "\n", "Row(id='123456', departmentName='Computer Science', firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000)\n", "[Row(id='123456', departmentName='Computer Science', firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(id='123456', departmentName='Computer Science', firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)]\n", "[Row(id='123456', departmentName='Computer Science', 
firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(id='123456', departmentName='Computer Science', firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)]\n", "8\n", "+------+\n", "| id|\n", "+------+\n", "|123456|\n", "|789012|\n", "|901234|\n", "|345678|\n", "+------+\n", "\n", "None\n", "4\n" ] } ], "source": [ "df.show(3)\n", "df.show(3, False)\n", "print(df.first())\n", "print(df.head(2))\n", "print(df.take(2))\n", "print(df.count())\n", "print(df.select(\"id\").distinct().show())\n", "print(df.select(\"id\").distinct().count())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame 조작하기\n", "\n", "### Selection" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+------+\n", "| id| departmentName|firstName|salary|\n", "+------+--------------------+---------+------+\n", "|123456| Computer Science| michael|100000|\n", "|123456| Computer Science| xiangrui|120000|\n", "|789012|Mechanical Engine...| matei|140000|\n", "|789012|Mechanical Engine...| null|160000|\n", "|345678| Theater and Drama| michael|100000|\n", "|345678| Theater and Drama| null|160000|\n", "|901234| Indoor Recreation| xiangrui|120000|\n", "|901234| Indoor Recreation| matei|140000|\n", "+------+--------------------+---------+------+\n", "\n", "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n" ] } ], "source": [ "# 열 선택\n", "df.select(\"id\", \"departmentName\", \"firstName\", \"salary\").show()\n", "\n", "# 중복 제거\n", "df.drop_duplicates(subset = ['firstName']).show()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+------+\n", "| id|salary|\n", "+------+------+\n", "|789012|160000|\n", "|345678|160000|\n", "+------+------+\n", "\n", "+------+------+\n", "| id|salary|\n", "+------+------+\n", "|789012|160000|\n", "|345678|160000|\n", "+------+------+\n", "\n", "+------+------+\n", "| id|salary|\n", "+------+------+\n", "|123456|100000|\n", "|123456|120000|\n", "|789012|140000|\n", "|345678|100000|\n", "|901234|120000|\n", "|901234|140000|\n", "+------+------+\n", "\n" ] } ], "source": [ "# 조건을 통한 선택\n", "# Where, Filter\n", "df.select(\"id\", \"salary\").filter(df[\"salary\"] > 140000).show()\n", "df.select(\"id\", \"salary\").where(f.col(\"salary\") > 140000).show()\n", "\n", "# Between\n", "df.select(\"id\", \"salary\").where(df[\"salary\"].between(10000, 140000)).show()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+----------------+\n", "| id| departmentName|\n", "+------+----------------+\n", "|123456|Computer Science|\n", "|123456|Computer Science|\n", "+------+----------------+\n", "\n", "+------+-----------------+\n", "| id| departmentName|\n", 
"+------+-----------------+\n", "|901234|Indoor Recreation|\n", "|901234|Indoor Recreation|\n", "+------+-----------------+\n", "\n", "+------+-----------------+\n", "| id| departmentName|\n", "+------+-----------------+\n", "|345678|Theater and Drama|\n", "|345678|Theater and Drama|\n", "+------+-----------------+\n", "\n", "+------+-----------------+\n", "| id| departmentName|\n", "+------+-----------------+\n", "|123456| Computer Science|\n", "|123456| Computer Science|\n", "|901234|Indoor Recreation|\n", "|901234|Indoor Recreation|\n", "+------+-----------------+\n", "\n" ] } ], "source": [ "# Like\n", "df.select(\"id\", \"departmentName\").where(df['departmentName'].like(\"%Com%\")).show()\n", "\n", "# Startswith, endswith\n", "df.select(\"id\", \"departmentName\").where(df['departmentName'].startswith(\"Indoor\")).show()\n", "df.select(\"id\", \"departmentName\").where(df['departmentName'].endswith(\"Drama\")).show()\n", "\n", "# isin\n", "df.select(\"id\", \"departmentName\").where(df[\"departmentName\"].isin(\"Computer Science\", \"Indoor Recreation\")).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setting" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+------+-------+\n", "| id|salary| bonus|\n", "+------+------+-------+\n", "|123456|100000|50000.0|\n", "|123456|120000|60000.0|\n", "+------+------+-------+\n", "only showing top 2 rows\n", "\n", "+------+------+-------+\n", "| id|salary| bonus|\n", "+------+------+-------+\n", "|123456|100000|50000.0|\n", "|123456|120000|60000.0|\n", "+------+------+-------+\n", "only showing top 2 rows\n", "\n", "+------+----------------+------+\n", "| id| departmentName|substr|\n", "+------+----------------+------+\n", "|123456|Computer Science| Com|\n", "|123456|Computer Science| Com|\n", "+------+----------------+------+\n", "only showing top 2 rows\n", "\n", "+------+------+----+\n", "| id|salary|cost|\n", "+------+------+----+\n", "|123456|100000| Low|\n", "|123456|120000| Low|\n", "|789012|140000|High|\n", "+------+------+----+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "# 연산을 통한 컬럼 생성\n", "df.select(\"id\", \"salary\", (df[\"salary\"] * 0.5).alias(\"bonus\")).show(2)\n", "df.select(\"id\", \"salary\").withColumn(\"bonus\", df[\"salary\"] * 0.5).show(2)\n", "df.select(\"id\", \"departmentName\", (df[\"departmentName\"].substr(1, 3)).alias(\"substr\")).show(2)\n", "\n", "# 조건을 통한 컬럼 생성\n", "df.select(\"id\", \"salary\", f.when(df[\"salary\"] > 120000, \"High\").otherwise(\"Low\").alias(\"cost\")).show(3)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+-------+\n", "| id| departmentName|firstName|lastName| email|salary| bonus|\n", "+------+--------------------+---------+--------+--------------------+------+-------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|50000.0|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|60000.0|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|70000.0|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|80000.0|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|50000.0|\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|80000.0|\n", "|901234| Indoor 
Recreation| xiangrui| meng|no-reply@stanford...|120000|60000.0|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|70000.0|\n", "+------+--------------------+---------+--------+--------------------+------+-------+\n", "\n", "+------+--------------------+---------+--------+--------------------+------+-------+\n", "| id| departmentName|firstName|lastName| email|salary| bonus|\n", "+------+--------------------+---------+--------+--------------------+------+-------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|50000.0|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|60000.0|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|70000.0|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|80000.0|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|50000.0|\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|80000.0|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|60000.0|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|70000.0|\n", "+------+--------------------+---------+--------+--------------------+------+-------+\n", "\n" ] } ], "source": [ "# 사용자 함수를 통한 컬럼 생성\n", "# Lambda 함수 방식\n", "bonus = f.udf(lambda x, y: x * y, t.FloatType())\n", "df.withColumn('bonus', bonus(df['salary'], f.lit(0.5))).show()\n", "\n", "# Annotation 방식\n", "@f.udf('float')\n", "def bonus(x, y):\n", " return x * y\n", "\n", "df.withColumn('bonus', bonus(df['salary'], f.lit(0.5))).show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---+---+---+---+\n", "| c1| c2| c3| c4| c5|\n", "+---+---+---+---+---+\n", "| 1| 2| 3| 4| 5|\n", "| 1| 2| 3| 4| 5|\n", "| 1| 2| 3| 4| 5|\n", "+---+---+---+---+---+\n", "\n", "+---+---+---+---+---+-------+\n", "| c1| c2| c3| c4| c5|sum_two|\n", "+---+---+---+---+---+-------+\n", "| 1| 2| 3| 4| 5| 3|\n", "| 1| 2| 3| 4| 5| 3|\n", "| 1| 2| 3| 4| 5| 3|\n", "+---+---+---+---+---+-------+\n", "\n", "+---+---+---+---+---+-------+\n", "| c1| c2| c3| c4| c5|sum_all|\n", "+---+---+---+---+---+-------+\n", "| 1| 2| 3| 4| 5| 15|\n", "| 1| 2| 3| 4| 5| 15|\n", "| 1| 2| 3| 4| 5| 15|\n", "+---+---+---+---+---+-------+\n", "\n", "+---+---+---+---+---+--------+\n", "| c1| c2| c3| c4| c5|sum_some|\n", "+---+---+---+---+---+--------+\n", "| 1| 2| 3| 4| 5| 6|\n", "| 1| 2| 3| 4| 5| 6|\n", "| 1| 2| 3| 4| 5| 6|\n", "+---+---+---+---+---+--------+\n", "\n" ] } ], "source": [ "df1 = sc.parallelize([\n", " [1, 2, 3, 4, 5], \n", " [1, 2, 3, 4, 5], \n", " [1, 2, 3, 4, 5], \n", "]).toDF(['c1', 'c2', 'c3', 'c4', 'c5'])\n", "df1.show()\n", "\n", "# Array 방식을 통한 여러 컬럼 연산\n", "def sum_two(*args):\n", " return args[0] + args[1]\n", "\n", "udf_sum_two = f.udf(sum_two, t.IntegerType())\n", " \n", "def sum_all(*args):\n", " return sum(args)\n", "\n", "udf_sum_all = f.udf(sum_all, t.IntegerType())\n", "\n", "\n", "# Sum Two Columns c1 + c2\n", "df1.withColumn(\"sum_two\", udf_sum_two(f.col(\"c1\"), f.col(\"c2\"))).show()\n", "\n", "# Sum All Columns\n", "df1.withColumn(\"sum_all\", udf_sum_all(*[f.col(i) for i in df1.columns])).show()\n", "\n", "# Sum Some Columns\n", "df1.withColumn(\"sum_some\", udf_sum_all(f.col(\"c1\"), f.col(\"c2\"), f.col(\"c3\"))).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Missing Value" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+---------+--------+\n", "| id|firstName|lastName|\n", "+------+---------+--------+\n", "|123456| michael|armbrust|\n", "|123456| xiangrui| meng|\n", "|789012| matei| null|\n", "|789012| null| wendell|\n", "|345678| michael|armbrust|\n", "|345678| null| wendell|\n", "|901234| xiangrui| meng|\n", "|901234| matei| null|\n", "+------+---------+--------+\n", "\n", "+------+---------+\n", "| id|firstName|\n", "+------+---------+\n", "|123456| michael|\n", "|123456| xiangrui|\n", "|789012| matei|\n", "|345678| michael|\n", "|901234| xiangrui|\n", "|901234| matei|\n", "+------+---------+\n", "\n", "+------+---------+\n", "| id|firstName|\n", "+------+---------+\n", "|789012| null|\n", "|345678| null|\n", "+------+---------+\n", "\n" ] } ], "source": [ "df.select(\"id\", \"firstName\", \"lastName\").show()\n", "\n", "# 결측치 체크 \n", "df.select(\"id\", \"firstName\").where(df[\"firstName\"].isNotNull()).show()\n", "df.select(\"id\", \"firstName\").where(df[\"firstName\"].isNull()).show()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+---------+--------+\n", "| id|firstName|lastName|\n", "+------+---------+--------+\n", "|123456| michael|armbrust|\n", "|123456| xiangrui| meng|\n", "|789012| matei| Unknown|\n", "|789012| Unknown| wendell|\n", "|345678| michael|armbrust|\n", "|345678| Unknown| wendell|\n", "|901234| xiangrui| meng|\n", "|901234| matei| Unknown|\n", "+------+---------+--------+\n", "\n" ] } ], "source": [ "# 상수로 체우기\n", "df.select(\"id\", \"firstName\", \"lastName\").fillna({ 'firstName': 'Unknown', 'lastName': 'Unknown' }).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Operation" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| ID| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n", "+------+--------------------+---------+--------+------+\n", "| id| departmentName|firstName|lastName|salary|\n", "+------+--------------------+---------+--------+------+\n", "|123456| Computer Science| michael|armbrust|100000|\n", "|123456| Computer Science| xiangrui| meng|120000|\n", "|789012|Mechanical Engine...| matei| null|140000|\n", "|789012|Mechanical Engine...| null| wendell|160000|\n", "|345678| Theater and Drama| michael|armbrust|100000|\n", "|345678| Theater and Drama| null| wendell|160000|\n", "|901234| Indoor Recreation| xiangrui| meng|120000|\n", "|901234| Indoor Recreation| matei| null|140000|\n", "+------+--------------------+---------+--------+------+\n", 
"\n", "n rows: 8\n", "+-------+-----------------+-----------------+---------+--------+--------------------+------------------+\n", "|summary| id| departmentName|firstName|lastName| email| salary|\n", "+-------+-----------------+-----------------+---------+--------+--------------------+------------------+\n", "| count| 8| 8| 6| 6| 8| 8|\n", "| mean| 539845.0| null| null| null| null| 130000.0|\n", "| stddev|339649.7466592818| null| null| null| null|23904.572186687874|\n", "| min| 123456| Computer Science| matei|armbrust|no-reply@berkeley...| 100000|\n", "| max| 901234|Theater and Drama| xiangrui| wendell|no-reply@waterloo...| 160000|\n", "+-------+-----------------+-----------------+---------+--------+--------------------+------------------+\n", "\n" ] } ], "source": [ "# 컬럼 이름 변경\n", "df.withColumnRenamed('id', 'ID').show()\n", "\n", "# 컬럼 삭제\n", "df.drop(\"email\").show()\n", "\n", "# 통계 확인\n", "print(\"n rows: {}\".format(df.count()))\n", "df.describe().show()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+----------+----------+\n", "| departmentName|tot_salary|avg_salary|\n", "+--------------------+----------+----------+\n", "| Theater and Drama| 260000| 130000.0|\n", "|Mechanical Engine...| 300000| 150000.0|\n", "| Computer Science| 220000| 110000.0|\n", "| Indoor Recreation| 260000| 130000.0|\n", "+--------------------+----------+----------+\n", "\n", "+--------------------+--------+--------+--------+--------+\n", "| departmentName| null| matei| michael|xiangrui|\n", "+--------------------+--------+--------+--------+--------+\n", "| Theater and Drama|160000.0| null|100000.0| null|\n", "|Mechanical Engine...|160000.0|140000.0| null| null|\n", "| Computer Science| null| null|100000.0|120000.0|\n", "| Indoor Recreation| null|140000.0| null|120000.0|\n", "+--------------------+--------+--------+--------+--------+\n", "\n" ] } ], "source": [ "# GroupBy\n", "df.groupBy(\"departmentName\").agg(f.sum('salary').alias('tot_salary'), f.mean('salary').alias('avg_salary')).show()\n", "\n", "# Pivot\n", "df.groupBy(\"departmentName\").pivot('firstName').agg(f.mean('salary').alias('avg_salary')).show()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n", "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|345678| Theater and 
Drama| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n", "+------+--------------------+---------+--------+--------------------+------+\n", "| id| departmentName|firstName|lastName| email|salary|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "|345678| Theater and Drama| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| null| wendell|no-reply@berkeley...|160000|\n", "|789012|Mechanical Engine...| matei| null|no-reply@waterloo...|140000|\n", "|901234| Indoor Recreation| matei| null|no-reply@waterloo...|140000|\n", "|123456| Computer Science| xiangrui| meng|no-reply@stanford...|120000|\n", "|901234| Indoor Recreation| xiangrui| meng|no-reply@stanford...|120000|\n", "|123456| Computer Science| michael|armbrust|no-reply@berkeley...|100000|\n", "|345678| Theater and Drama| michael|armbrust|no-reply@berkeley...|100000|\n", "+------+--------------------+---------+--------+--------------------+------+\n", "\n" ] } ], "source": [ "# Sort\n", "df.sort(df['salary'].desc(), df['id'].asc()).show()\n", "df.sort([\"salary\", \"id\"], ascending=[False, True]).show()\n", "df.orderBy([\"salary\", \"id\"],ascending=[0, 1]).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Merge" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+---------+\n", "|a_id| a_name|\n", "+----+---------+\n", "| 1| Pirate|\n", "| 2| Monkey|\n", "| 3| Ninja|\n", "| 4|Spaghetti|\n", "+----+---------+\n", "\n", "+----+-----------+\n", "|b_id| b_name|\n", "+----+-----------+\n", "| 1| Rutabaga|\n", "| 2| Pirate|\n", "| 3| Ninja|\n", "| 4|Darth Vader|\n", "+----+-----------+\n", "\n", "+----+------+----+------+\n", "|a_id|a_name|b_id|b_name|\n", "+----+------+----+------+\n", "| 3| Ninja| 3| Ninja|\n", "| 1|Pirate| 2|Pirate|\n", "+----+------+----+------+\n", "\n" ] } ], "source": [ "sa = [(1, 'Pirate'),(2, 'Monkey'),(3, 'Ninja'),(4, 'Spaghetti')]\n", "df_a = spark.createDataFrame(sa, ['a_id','a_name'])\n", " \n", "sb = [(1, 'Rutabaga'),(2, 'Pirate'),(3, 'Ninja'),(4, 'Darth Vader')]\n", "df_b = spark.createDataFrame(sb, ['b_id','b_name'])\n", " \n", "df_a.show()\n", "df_b.show()\n", "\n", "# Join; Inner\n", "df_join = df_a.alias('a').join(df_b.alias('b'), f.col(\"a.a_name\") == f.col(\"b.b_name\"), 'inner')\n", "df_join = df_join.select(f.col(\"a.*\"), f.col(\"b.*\"))\n", "df_join.show()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+---------+----+------+\n", "|a_id| a_name|b_id|b_name|\n", "+----+---------+----+------+\n", "| 4|Spaghetti|null| null|\n", "| 3| Ninja| 3| Ninja|\n", "| 1| Pirate| 2|Pirate|\n", "| 2| Monkey|null| null|\n", "+----+---------+----+------+\n", "\n", "+----+------+----+-----------+\n", "|a_id|a_name|b_id| b_name|\n", 
"+----+------+----+-----------+\n", "|null| null| 1| Rutabaga|\n", "| 3| Ninja| 3| Ninja|\n", "| 1|Pirate| 2| Pirate|\n", "|null| null| 4|Darth Vader|\n", "+----+------+----+-----------+\n", "\n", "+----+---------+----+-----------+\n", "|a_id| a_name|b_id| b_name|\n", "+----+---------+----+-----------+\n", "|null| null| 1| Rutabaga|\n", "| 4|Spaghetti|null| null|\n", "| 3| Ninja| 3| Ninja|\n", "| 1| Pirate| 2| Pirate|\n", "| 2| Monkey|null| null|\n", "|null| null| 4|Darth Vader|\n", "+----+---------+----+-----------+\n", "\n" ] } ], "source": [ "# Join: Left\n", "df_join = df_a.alias('a').join(df_b.alias('b'), f.col(\"a.a_name\") == f.col(\"b.b_name\"), 'left')\n", "df_join = df_join.select(f.col(\"a.*\"), f.col(\"b.*\"))\n", "df_join.show()\n", "\n", "# Join: Left\n", "df_join = df_a.alias('a').join(df_b.alias('b'), f.col(\"a.a_name\") == f.col(\"b.b_name\"), 'right')\n", "df_join = df_join.select(f.col(\"a.*\"), f.col(\"b.*\"))\n", "df_join.show()\n", "\n", "# Join: Full\n", "df_join = df_a.alias('a').join(df_b.alias('b'), f.col(\"a.a_name\") == f.col(\"b.b_name\"), 'full')\n", "df_join = df_join.select(f.col(\"a.*\"), f.col(\"b.*\"))\n", "df_join.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SQL Query" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+----------+----------+\n", "| departmentName|tot_salary|avg_salary|\n", "+--------------------+----------+----------+\n", "| Theater and Drama| 260000| 130000.0|\n", "|Mechanical Engine...| 300000| 150000.0|\n", "| Computer Science| 220000| 110000.0|\n", "| Indoor Recreation| 260000| 130000.0|\n", "+--------------------+----------+----------+\n", "\n" ] } ], "source": [ "# register the DataFrame as a temp table so that we can query it using SQL\n", "df.registerTempTable(\"df_example\")\n", "\n", "# Perform the same query as the DataFrame above and return ``explain``\n", "df_table = spark.sql(\"SELECT departmentName, SUM(salary) AS tot_salary, AVG(salary) AS avg_salary FROM df_example GROUP BY departmentName\")\n", "df_table.show()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.9 64-bit ('3.8.9')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" }, "nikola": { "category": "", "date": "2019-03-01", "description": "", "link": "", "slug": "de-spark-dataframe-10mins", "tags": "", "title": "Data Engineering - Apache Spark Dataframe 10분만에 훑어보기", "type": "text" }, "notebookId": 3566706889179044, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true }, "vscode": { "interpreter": { "hash": "81b98c941daad6509d4c89342869ca88f891260205139ff875cb30be518b42c4" } } }, "nbformat": 4, "nbformat_minor": 4 }