Configuration and Usage
In [1]:
from pyspark.sql import SparkSession
In [2]:
import ipywidgets as widgets

# Capture the noisy Spark startup output in an ipywidgets Output widget
out = widgets.Output()
with out:
    spark = SparkSession.builder.getOrCreate()
Normally IPython only displays the output of the last statement. However, it can be handy to run multiple SQL magics in a single cell and see the output of each execution. Setting `ast_node_interactivity` to `all` enables that.
In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [4]:
%load_ext jupyterlab_sql_editor.ipython_magic.sparksql
In [5]:
%config SparkSql.cacheTTL=3600
%config SparkSql.outputFile="/tmp/sparkdb.schema.json"
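If you want these settings applied in every session, the same traitlets can be set in an IPython configuration file instead of with `%config`. A minimal sketch, assuming the default profile; the file path is the standard IPython location, not something this extension prescribes:

# ~/.ipython/profile_default/ipython_config.py (standard IPython config file)
c = get_config()

# Same traitlets as the %config lines above
c.SparkSql.cacheTTL = 3600
c.SparkSql.outputFile = "/tmp/sparkdb.schema.json"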
In [6]:
df = spark.read.json("file:/path/to/contacts.json")
df.createOrReplaceTempView("CONTACTS_TABLE")
df.printSchema()
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- streetAddress: string (nullable = true)
 |-- age: long (nullable = true)
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)
In [7]:
df = spark.read.json("file:/path/to/conversations.json")
df.createOrReplaceTempView("MESSAGES_TABLE")
df.printSchema()
root
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- messages: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- body: string (nullable = true)
 |    |    |-- time: string (nullable = true)
In [8]:
%sparksql --refresh all
Exporting functions: [########################################] 100.0%
Schema file updated: /tmp/sparkdb.schema.json
In [9]:
%sparksql SHOW TABLES
| namespace | tableName | isTemporary |
| --- | --- | --- |
|  | contacts_table | true |
|  | messages_table | true |
Execution time: 0.24 seconds
Press Tab to trigger autocompletion and Ctrl-Q to format the cell
In [10]:
%%sparksql --output grid --limit 1000
SELECT
  id,
  uuid()
FROM
  RANGE (1, 1000)
Execution time: 1.86 seconds
In [11]:
%%sparksql --output html --limit 3
SELECT
  con.`first Name`,
  con.phoneNumbers [ 0 ].type as primary_number,
  array_contains(con.phoneNumbers.type, 'home') as flag
FROM
  contacts_table AS con
| first Name | primary_number | flag |
| --- | --- | --- |
| Rack | home | true |
Execution time: 0.19 seconds
In [12]:
%%sparksql --output json --limit 3
SELECT
  *
FROM
  contacts_table AS con
<IPython.core.display.JSON object>
Execution time: 0.19 seconds
In [13]:
%%sparksql --output schema
SELECT
  *
FROM
  contacts_table AS con
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- streetAddress: string (nullable = true)
 |-- age: long (nullable = true)
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)
In [14]:
%%sparksql --view the_exploded_table --output skip
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
Created temporary view `the_exploded_table`
Query execution skipped
In [15]:
%sparksql SHOW TABLES
| namespace | tableName | isTemporary |
| --- | --- | --- |
|  | contacts_table | true |
|  | messages_table | true |
|  | the_exploded_table | true |
Execution time: 0.08 seconds
Use the temporary view in subsequent queries, with autocomplete suggestions
In [16]:
%%sparksql
SELECT
  *
FROM
  the_exploded_table AS the
| address | age | first Name | last Name | phoneNumbers | phoneNumber |
| --- | --- | --- | --- | --- | --- |
| {San Jone, 394221, CA, 126} | 24 | Rack | Jackon | [{7383627627, home}] | {7383627627, home} |
Execution time: 0.25 seconds
In [17]:
%%sparksql --output text
SELECT
  *
FROM
  the_exploded_table AS the
+---------------------------+---+----------+---------+--------------------+------------------+
|                    address|age|first Name|last Name|        phoneNumbers|       phoneNumber|
+---------------------------+---+----------+---------+--------------------+------------------+
|{San Jone, 394221, CA, 126}| 24|      Rack|   Jackon|[{7383627627, home}]|{7383627627, home}|
+---------------------------+---+----------+---------+--------------------+------------------+
Execution time: 0.09 seconds
In [18]:
%%sparksql --output sql
SELECT
  *
FROM
  the_exploded_table AS the
Out[18]:
SELECT
  *
FROM
  the_exploded_table AS the
Create a dataframe variable to use in PySpark
In [19]:
%%sparksql --dataframe the_exploded_dataframe --output skip
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
Captured dataframe to local variable `the_exploded_dataframe`
Query execution skipped
Continue developing your query using the dataframe API
In [20]:
the_exploded_dataframe.select('phoneNumber').show()
+------------------+
|       phoneNumber|
+------------------+
|{7383627627, home}|
+------------------+
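From here the captured dataframe behaves like any other PySpark dataframe, so you can keep refining the result in code. A minimal sketch; the filter below is our own illustration, not something the magic generates:

# Hypothetical follow-up: keep only home numbers and count them
home_numbers = the_exploded_dataframe.filter("phoneNumber.type = 'home'")
home_numbers.count()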
In [27]:
# Declare a Python string containing SQL. The --start-sparksql/--end-sparksql
# markers let the extension detect the embedded SQL and provide syntax
# highlighting and autocompletion for it.
sql = '''
--start-sparksql
SELECT
  *, con.`first Name`,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
'''
print(sql)
--start-sparksql
SELECT
  *, con.`first Name`,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
In [ ]:
# Declare the same string again, this time without the extra column
sql = '''
--start-sparksql
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
'''
print(sql)
In [22]:
spark.sql(sql).show()
+--------------------+---+----------+---------+--------------------+------------------+
|             address|age|first Name|last Name|        phoneNumbers|       phoneNumber|
+--------------------+---+----------+---------+--------------------+------------------+
|{San Jone, 394221...| 24|      Rack|   Jackon|[{7383627627, home}]|{7383627627, home}|
+--------------------+---+----------+---------+--------------------+------------------+
In [23]:
%%sparksql?
Docstring:
::

  %sparksql [-l max_rows] [-r all|local|none] [-d name] [-c] [-e] [-v name]
            [-o sql|json|html|grid|text|schema|skip|none] [-s] [-j]
            [-t max_cell_length]
            [sql [sql ...]]

Magic that works both as %sparksql and as %%sparksql

positional arguments:
  sql                   SQL statement to execute

optional arguments:
  -l max_rows, --limit max_rows
                        The maximum number of rows to display. A value of
                        zero is equivalent to `--output skip`
  -r <all|local|none>, --refresh <all|local|none>
                        Force the regeneration of the schema cache file. The
                        `local` option will only update tables/views created
                        in the local Spark context.
  -d name, --dataframe name
                        Capture dataframe in a local variable named `name`
  -c, --cache           Cache dataframe
  -e, --eager           Cache dataframe with eager load
  -v name, --view name  Create or replace a temporary view named `name`
  -o <sql|json|html|grid|text|schema|skip|none>, --output <sql|json|html|grid|text|schema|skip|none>
                        Output format. Defaults to html. The `sql` option
                        prints the SQL statement that will be executed
                        (useful to test jinja templated statements)
  -s, --show-nonprinting
                        Replace none printable characters with their ascii
                        codes (LF -> \x0a)
  -j, --jinja           Enable Jinja templating support
  -t max_cell_length, --truncate max_cell_length
                        Truncate output

File: /data/dev/jupyterlab-sql-editor/venv/lib/python3.8/site-packages/jupyterlab_sql_editor/ipython_magic/sparksql/sparksql.py
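The `--jinja` flag renders the statement as a Jinja template before execution, substituting variables from the notebook namespace. A minimal sketch, assuming that behavior; the variable name `min_age` is our own example, not part of the magic. First define a variable in one cell:

# Define a variable in the notebook namespace for the template to use
min_age = 20

then reference it from a templated query in another cell:

%%sparksql --jinja --limit 10
SELECT
  *
FROM
  contacts_table AS con
WHERE
  con.age > {{ min_age }}

Running the same cell with `--output sql` would print the rendered statement instead of executing it, which the docstring above recommends for testing templated queries.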
Long-running queries show a progress bar and a link to the Spark UI
In [24]:
%%sparksql --limit 1 --output grid
SELECT
  id,
  rand() AS f1,
  rand() AS f2,
  rand() AS f3,
  rand() AS f4,
  rand() AS f5,
  TRANSFORM(SEQUENCE(1, 512), x -> rand()) AS data -- array of 512 floats
FROM
  RANGE
  (1, 400000, 1, 100)
UNION
SELECT
  id,
  rand() AS f1,
  rand() AS f2,
  rand() AS f3,
  rand() AS f4,
  rand() AS f5,
  TRANSFORM(SEQUENCE(1, 512), x -> rand()) AS data -- array of 512 floats
FROM
  RANGE
  (1, 40000, 1, 100)
only showing top 1 row
Execution time: 10.18 seconds
In [26]:
%%sparksql
SELECT
  mes.`first Name`,
  mes.`last Name`,
  mes.messages,
  mes.messages.body,
  mes.messages.time
FROM
  contacts_table AS con
  INNER JOIN messages_table AS mes ON mes.`first Name` = con.`first Name`
| first Name | last Name | messages | body | time |
| --- | --- | --- | --- | --- |
| Rack | Jackon | [{hello, 2022-01-15}, {you there, 2022-01-16}] | [hello, you there] | [2022-01-15, 2022-01-16] |
Execution time: 0.15 seconds