A Coding Tutorial on Datashader: Rendering Massive Datasets with High-Performance Python Visual Analytics

import subprocess, sys
# Install the third-party packages with the same interpreter that is running
# this script (sys.executable), so they land in the active environment.
# check_call raises CalledProcessError if pip exits with a non-zero status.
# NOTE(review): matplotlib and pandas are imported below but not listed here;
# presumably the runtime (e.g. a hosted notebook) already provides them.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "datashader", "colorcet", "numba", "scipy",
])

import numpy as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
from datashader import reductions as rd
import colorcet as cc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.gridspec import GridSpec
from scipy.stats import multivariate_normal
import time, warnings
# Globally ignore every warning category for the rest of the run. This also
# hides deprecation notices, so remove it when debugging library upgrades.
warnings.filterwarnings("ignore")

print("Datashader version:", ds.__version__)

def show(img, title="", ax=None, figsize=(6, 5)):
    """Draw a shaded image on a Matplotlib axes with a bold title and no ticks.

    Args:
        img: Object exposing ``to_pil()``; the script passes the results of
            ``tf.shade`` / ``tf.spread`` / ``tf.set_background``.
        title: Text placed above the image.
        ax: Axes to draw on. When ``None``, a new figure is created and is
            laid out and displayed before returning.
        figsize: Figure size handed to ``plt.subplots``; only used when
            ``ax`` is ``None``.

    Returns:
        None. The function only draws (and, in standalone mode, shows).
    """
    standalone = ax is None
    if standalone:
        _, ax = plt.subplots(figsize=figsize)

    rgba = img.to_pil()
    # origin="upper" keeps row 0 of the image at the top of the axes;
    # aspect="auto" lets the image stretch to fill the axes rectangle.
    ax.imshow(rgba, origin="upper", aspect="auto")
    ax.set_title(title, fontsize=11, fontweight="bold")
    ax.axis("off")

    # A caller-supplied axes belongs to a larger figure that the caller
    # finalises itself, so only finish the figure we created here.
    if standalone:
        plt.tight_layout()
        plt.show()

print("\n=== SECTION 1: Core Pipeline ===")

# Seeded generator so every run produces the same point cloud.
rng = np.random.default_rng(42)
N = 2_000_000

# Three Gaussian blobs of N // 3 samples each: two tight ones centred at
# (-1, -1) and (1, 1), plus one at the origin that is wide in x and narrow in y.
x = np.concatenate([
    rng.normal(-1, 0.5, N // 3),
    rng.normal(1, 0.5, N // 3),
    rng.normal(0, 1.5, N // 3),
])
y = np.concatenate([
    rng.normal(-1, 0.5, N // 3),
    rng.normal(1, 0.5, N // 3),
    rng.normal(0, 0.5, N // 3),
])
df_base = pd.DataFrame({"x": x, "y": y})

# Fixed 600x500 pixel grid over [-4, 4] x [-4, 4].
canvas = ds.Canvas(plot_width=600, plot_height=500,
                   x_range=(-4, 4), y_range=(-4, 4))

# Aggregate the points onto the canvas with a count reduction.
agg = canvas.points(df_base, "x", "y", agg=rd.count())

# Shade the same aggregate three ways to compare the `how` normalisations.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
combos = [
    ("Linear / blues", tf.shade(agg, cmap=cc.blues, how="linear")),
    ("Log / fire", tf.shade(agg, cmap=cc.fire, how="log")),
    ("Eq-hist / bmy", tf.shade(agg, cmap=cc.bmy, how="eq_hist")),
]
for ax, (title, img) in zip(axes, combos):
    show(img, title, ax=ax)
plt.suptitle("Section 1 – 2 M points: Linear vs Log vs Eq-Hist normalisation",
             fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()

print("\n=== SECTION 2: Reduction Types ===")

# Use the real row count: N // 3 * 3 is slightly less than N.
n_actual = len(df_base)
# Add a numeric column and a three-level categorical column to aggregate on.
df_base["value"] = rng.exponential(scale=2, size=n_actual)
df_base["label"] = pd.Categorical(
    rng.choice(["A", "B", "C"], size=n_actual),
    categories=["A", "B", "C"],
)

canvas2 = ds.Canvas(plot_width=400, plot_height=350,
                    x_range=(-4, 4), y_range=(-4, 4))

# (panel title, reduction, colormap). A colormap of None marks the categorical
# reduction, which is shaded with a per-category colour key instead.
reductions_cfg = [
    ("count()", rd.count(), cc.kbc),
    ("sum(value)", rd.sum("value"), cc.CET_L3),
    ("mean(value)", rd.mean("value"), cc.CET_D4),
    ("std(value)", rd.std("value"), cc.CET_L16),
    ("min(value)", rd.min("value"), cc.CET_L17),
    ("max(value)", rd.max("value"), cc.bgyw),
    ("var(value)", rd.var("value"), cc.CET_L18),
    ("count_cat(label)", rd.count_cat("label"), None),
]

# 2 x 4 grid: one panel per entry in reductions_cfg.
fig, axes = plt.subplots(2, 4, figsize=(18, 9))
axes = axes.flat

for ax, (name, agg_fn, cmap) in zip(axes, reductions_cfg):
    agg_r = canvas2.points(df_base, "x", "y", agg=agg_fn)
    if cmap is None:
        img = tf.shade(agg_r, color_key={"A": "#e41a1c", "B": "#377eb8", "C": "#4daf4a"})
    else:
        img = tf.shade(agg_r, cmap=cmap, how="eq_hist")
    show(img, name, ax=ax)

plt.suptitle("Section 2 – All Reduction Types on 2 M points", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

print("\n=== SECTION 3: Categorical Visualisation ===")

N_cat = 500_000
categories = ["Cluster A", "Cluster B", "Cluster C", "Cluster D"]
centers = [(-2, -2), (-2, 2), (2, -2), (2, 2)]
colors = {"Cluster A": "#e41a1c", "Cluster B": "#377eb8",
          "Cluster C": "#4daf4a", "Cluster D": "#ff7f00"}

# Every cluster gets the same number of points, so compute it once.
n = N_cat // len(categories)

# One frame per cluster: Gaussian scatter (sigma 0.8) around its centre.
# The categorical column carries the full category list in every frame.
frames = []
for cat, (cx, cy) in zip(categories, centers):
    frames.append(pd.DataFrame({
        "x": rng.normal(cx, 0.8, n),
        "y": rng.normal(cy, 0.8, n),
        "cat": pd.Categorical([cat] * n, categories=categories),
    }))
df_cat = pd.concat(frames, ignore_index=True)

canvas3 = ds.Canvas(plot_width=500, plot_height=500,
                    x_range=(-5, 5), y_range=(-5, 5))
agg_cat = canvas3.points(df_cat, "x", "y", agg=rd.count_cat("cat"))

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: categorical shading as-is.
img_raw = tf.shade(agg_cat, color_key=colors)
show(img_raw, "Raw (no spread)", ax=axes[0])

# Panel 2: same shading passed through tf.spread with px=1.
img_sp1 = tf.spread(tf.shade(agg_cat, color_key=colors), px=1)
show(img_sp1, "Spread px=1", ax=axes[1])

# Panel 3: same shading with the background set to black.
img_bg = tf.set_background(tf.shade(agg_cat, color_key=colors), color="black")
show(img_bg, "Black background", ax=axes[2])

# The image carries no legend handles, so plot empty series purely to
# register one coloured marker per category with the legend.
for cat, col in colors.items():
    axes[2].plot([], [], "o", color=col, label=cat, markersize=8)
axes[2].legend(loc="lower right", fontsize=8, framealpha=0.6)

plt.suptitle("Section 3 – Categorical Rendering (500 k points)", fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()

Source link

A Coding Tutorial on Datashader: Rendering Massive Datasets with High-Performance Python Visual Analytics

Subscribe To Our Newsletter

You have Successfully Subscribed!

A Coding Tutorial on Datashader: Rendering Massive Datasets with High-Performance Python Visual Analytics

You may also like

Subscribe To Our Newsletter

You have Successfully Subscribed!