Configuration and Usage
In [1]:
from pyspark.sql import SparkSession
In [2]:
import ipywidgets as widgets

# Capture the noisy Spark startup output in an ipywidgets Output widget
out = widgets.Output()
with out:
    spark = SparkSession.builder.getOrCreate()
Normally IPython only displays the output of the last statement. However, it can be handy to run multiple SQL magics in a single cell and see the output of each execution. Setting `ast_node_interactivity` to `all` enables that.
In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [4]:
%load_ext jupyterlab_sql_editor.ipython_magic.sparksql
In [5]:
%config SparkSql.cacheTTL=3600
%config SparkSql.outputFile="/tmp/sparkdb.schema.json"
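If you want these settings applied in every session, the same traitlets can be set in an IPython configuration file instead of with `%config`. A minimal sketch, assuming the default profile; the file path is the standard IPython location, not something this extension prescribes:

# ~/.ipython/profile_default/ipython_config.py (standard IPython config file)
c = get_config()

# Same traitlets as the %config lines above
c.SparkSql.cacheTTL = 3600
c.SparkSql.outputFile = "/tmp/sparkdb.schema.json"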
In [6]:
df = spark.read.json("file:/path/to/contacts.json")
df.createOrReplaceTempView("CONTACTS_TABLE")
df.printSchema()
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- streetAddress: string (nullable = true)
 |-- age: long (nullable = true)
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)
In [7]:
df = spark.read.json("file:/path/to/conversations.json")
df.createOrReplaceTempView("MESSAGES_TABLE")
df.printSchema()
root
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- messages: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- body: string (nullable = true)
 |    |    |-- time: string (nullable = true)
In [8]:
%sparksql --refresh all
Exporting functions: [########################################] 100.0%
Schema file updated: /tmp/sparkdb.schema.json
In [9]:
%sparksql SHOW TABLES
| namespace | tableName | isTemporary |
| --- | --- | --- |
|  | contacts_table | true |
|  | messages_table | true |
Execution time: 0.24 seconds
Press Tab to trigger autocompletion and Ctrl-Q to format the cell
In [10]:
%%sparksql --output grid --limit 1000
SELECT
  id,
  uuid()
FROM
  RANGE (1, 1000)
Execution time: 1.86 seconds
In [11]:
%%sparksql --output html --limit 3
SELECT
  con.`first Name`,
  con.phoneNumbers [ 0 ].type as primary_number,
  array_contains(con.phoneNumbers.type, 'home') as flag
FROM
  contacts_table AS con
| first Name | primary_number | flag |
| --- | --- | --- |
| Rack | home | true |
Execution time: 0.19 seconds
In [12]:
%%sparksql --output json --limit 3
SELECT
  *
FROM
  contacts_table AS con
<IPython.core.display.JSON object>
Execution time: 0.19 seconds
In [13]:
%%sparksql --output schema
SELECT
  *
FROM
  contacts_table AS con
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- streetAddress: string (nullable = true)
 |-- age: long (nullable = true)
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)
In [14]:
%%sparksql --view the_exploded_table --output skip
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
Created temporary view `the_exploded_table`
Query execution skipped
In [15]:
%sparksql SHOW TABLES
| namespace | tableName | isTemporary |
| --- | --- | --- |
|  | contacts_table | true |
|  | messages_table | true |
|  | the_exploded_table | true |
Execution time: 0.08 seconds
Use the temporary view in subsequent queries, with autocomplete suggestions
In [16]:
%%sparksql
SELECT
  *
FROM
  the_exploded_table AS the
| address | age | first Name | last Name | phoneNumbers | phoneNumber |
| --- | --- | --- | --- | --- | --- |
| {San Jone, 394221, CA, 126} | 24 | Rack | Jackon | [{7383627627, home}] | {7383627627, home} |
Execution time: 0.25 seconds
In [17]:
%%sparksql --output text
SELECT
  *
FROM
  the_exploded_table AS the
+---------------------------+---+----------+---------+--------------------+------------------+
|                    address|age|first Name|last Name|        phoneNumbers|       phoneNumber|
+---------------------------+---+----------+---------+--------------------+------------------+
|{San Jone, 394221, CA, 126}| 24|      Rack|   Jackon|[{7383627627, home}]|{7383627627, home}|
+---------------------------+---+----------+---------+--------------------+------------------+
Execution time: 0.09 seconds
In [18]:
%%sparksql --output sql
SELECT
  *
FROM
  the_exploded_table AS the
Out[18]:
SELECT
  *
FROM
  the_exploded_table AS the
Create a dataframe variable to use in PySpark
In [19]:
%%sparksql --dataframe the_exploded_dataframe --output skip
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
Captured dataframe to local variable `the_exploded_dataframe`
Query execution skipped
Continue developing your query using the dataframe API
In [20]:
the_exploded_dataframe.select('phoneNumber').show()
+------------------+
|       phoneNumber|
+------------------+
|{7383627627, home}|
+------------------+
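From here the captured dataframe behaves like any other PySpark dataframe, so you can keep refining the result in code. A minimal sketch; the filter below is our own illustration, not something the magic generates:

# Hypothetical follow-up: keep only home numbers and count them
home_numbers = the_exploded_dataframe.filter("phoneNumber.type = 'home'")
home_numbers.count()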
In [27]:
# Declare a Python string containing SQL. The --start-sparksql/--end-sparksql
# markers let the extension detect the embedded SQL and provide syntax
# highlighting and autocompletion for it.
sql = '''
--start-sparksql
SELECT
  *, con.`first Name`,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
'''
print(sql)
--start-sparksql
SELECT
  *, con.`first Name`,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
In [ ]:
# Declare the same string again, this time without the extra column
sql = '''
--start-sparksql
SELECT
  *,
  explode(con.phoneNumbers) as phoneNumber
FROM
  contacts_table AS con
--end-sparksql
'''
print(sql)
In [22]:
spark.sql(sql).show()
+--------------------+---+----------+---------+--------------------+------------------+
|             address|age|first Name|last Name|        phoneNumbers|       phoneNumber|
+--------------------+---+----------+---------+--------------------+------------------+
|{San Jone, 394221...| 24|      Rack|   Jackon|[{7383627627, home}]|{7383627627, home}|
+--------------------+---+----------+---------+--------------------+------------------+
In [23]:
%%sparksql?
Docstring:
::

  %sparksql [-l max_rows] [-r all|local|none] [-d name] [-c] [-e] [-v name]
            [-o sql|json|html|grid|text|schema|skip|none] [-s] [-j]
            [-t max_cell_length]
            [sql [sql ...]]

Magic that works both as %sparksql and as %%sparksql

positional arguments:
  sql                   SQL statement to execute

optional arguments:
  -l max_rows, --limit max_rows
                        The maximum number of rows to display. A value of
                        zero is equivalent to `--output skip`
  -r <all|local|none>, --refresh <all|local|none>
                        Force the regeneration of the schema cache file. The
                        `local` option will only update tables/views created
                        in the local Spark context.
  -d name, --dataframe name
                        Capture dataframe in a local variable named `name`
  -c, --cache           Cache dataframe
  -e, --eager           Cache dataframe with eager load
  -v name, --view name  Create or replace a temporary view named `name`
  -o <sql|json|html|grid|text|schema|skip|none>, --output <sql|json|html|grid|text|schema|skip|none>
                        Output format. Defaults to html. The `sql` option
                        prints the SQL statement that will be executed
                        (useful to test jinja templated statements)
  -s, --show-nonprinting
                        Replace none printable characters with their ascii
                        codes (LF -> \x0a)
  -j, --jinja           Enable Jinja templating support
  -t max_cell_length, --truncate max_cell_length
                        Truncate output

File: /data/dev/jupyterlab-sql-editor/venv/lib/python3.8/site-packages/jupyterlab_sql_editor/ipython_magic/sparksql/sparksql.py
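The `--jinja` flag renders the statement as a Jinja template before execution, substituting variables from the notebook namespace. A minimal sketch, assuming that behavior; the variable name `min_age` is our own example, not part of the magic. First define a variable in one cell:

# Define a variable in the notebook namespace for the template to use
min_age = 20

then reference it from a templated query in another cell:

%%sparksql --jinja --limit 10
SELECT
  *
FROM
  contacts_table AS con
WHERE
  con.age > {{ min_age }}

Running the same cell with `--output sql` would print the rendered statement instead of executing it, which the docstring above recommends for testing templated queries.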
Long-running queries show a progress bar and a link to the Spark UI
In [24]:
%%sparksql --limit 1 --output grid
SELECT
  id,
  rand() AS f1,
  rand() AS f2,
  rand() AS f3,
  rand() AS f4,
  rand() AS f5,
  TRANSFORM(SEQUENCE(1, 512), x -> rand()) AS data -- array of 512 floats
FROM
  RANGE
  (1, 400000, 1, 100)
UNION
SELECT
  id,
  rand() AS f1,
  rand() AS f2,
  rand() AS f3,
  rand() AS f4,
  rand() AS f5,
  TRANSFORM(SEQUENCE(1, 512), x -> rand()) AS data -- array of 512 floats
FROM
  RANGE
  (1, 40000, 1, 100)
only showing top 1 row
Execution time: 10.18 seconds
In [26]:
%%sparksql
SELECT
  mes.`first Name`,
  mes.`last Name`,
  mes.messages,
  mes.messages.body,
  mes.messages.time
FROM
  contacts_table AS con
  INNER JOIN messages_table AS mes ON mes.`first Name` = con.`first Name`
| first Name | last Name | messages | body | time |
| --- | --- | --- | --- | --- |
| Rack | Jackon | [{hello, 2022-01-15}, {you there, 2022-01-16}] | [hello, you there] | [2022-01-15, 2022-01-16] |
Execution time: 0.15 seconds