🐻 Polars Cheat Sheet — Eager & Lazy (Python)

Fast, expressive DataFrame ops for data prep, feature engineering, joins, windows, pivots, strings, dates, and more. Copy-ready snippets.

Setup & Basics

pip install polars

import polars as pl
pl.__version__  # check version

# Create DataFrame
df = pl.DataFrame({
    "a":[1,2,3,4],
    "b":[10,20,30,40],
    "grp":["x","x","y","y"],
})

# Column selector helpers
pl.all()            # all columns
pl.col("a")         # a single column expr
pl.cols("a","b")   # multiple cols by name
pl.selectors.numeric()  # numeric columns
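
A quick sketch of the selector helpers in use on a toy frame (cs is the usual alias for polars.selectors):

import polars.selectors as cs

df = pl.DataFrame({"a": [1, 2], "b": [10.0, 20.0], "grp": ["x", "y"]})
df.select(cs.numeric())             # only "a" and "b"
df.select(pl.col("a", "b"))         # explicit names
df.select(pl.all().exclude("grp"))  # everything except "grp"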

I/O (CSV, Parquet, JSON, IPC)

Use read_* for eager, scan_* for lazy (deferred) reading.

# CSV
pl.read_csv("file.csv")
pl.scan_csv("file.csv")  # lazy

# Parquet
pl.read_parquet("file.parquet")
pl.scan_parquet("file.parquet")

# JSON / NDJSON
pl.read_json("file.json")            # eager (array or records)
pl.read_ndjson("file.ndjson")        # newline-delimited

# IPC / Feather
pl.read_ipc("file.arrow")
pl.scan_ipc("file.arrow")

# Write
df.write_csv("out.csv")
df.write_parquet("out.parquet")
df.write_json("out.json")            # or write_ndjson

Select & Filter

Use expressions inside select / with_columns / filter.

df.select(pl.col("a"), (pl.col("b")*2).alias("b2"))

# Filter
out = df.filter( (pl.col("a") > 1) & (pl.col("b") <= 30) )

# Select by dtype or name pattern
num = df.select(pl.selectors.numeric())
starts = df.select(pl.col("^prefix_.*$"))   # a ^...$ pattern is treated as a regex

# Multi-column ops
df.select(pl.all().sum().name.suffix("_sum"))

# Row-wise reductions
df.select(pl.max_horizontal(["a","b"]).alias("row_max"))
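
Putting the pieces together on the toy df from Setup (a sketch; expected values noted in comments):

out = (
    df.filter(pl.col("a") > 1)                             # keeps a = 2, 3, 4
      .select(pl.col("a"), (pl.col("b") * 2).alias("b2"))  # b2 = 40, 60, 80
)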

Derive Columns & Casting

Prefer vectorized expressions. Avoid Python loops.

# Add / transform columns
out = df.with_columns([
    (pl.col("a") + pl.col("b")).alias("a_plus_b"),
    pl.when(pl.col("a") > 2).then(1).otherwise(0).alias("flag"),
    pl.col("grp").cast(pl.Categorical),
])

# Rename / drop
out = df.rename({"a":"A"}).drop(["b"])

# Type casting
out = df.with_columns([
    pl.col("a").cast(pl.Int64),
    pl.col("b").cast(pl.Float64),
])
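
The "avoid Python loops" advice in practice: the loop below works but runs element by element in Python, while the expression version is evaluated by the engine (a sketch on the toy df):

# Slow: Python-level loop (anti-pattern)
flags = [1 if x > 2 else 0 for x in df["a"].to_list()]

# Fast: a single vectorized expression
out = df.with_columns(
    pl.when(pl.col("a") > 2).then(1).otherwise(0).alias("flag")
)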

Strings

df.with_columns([
    pl.col("name").str.to_lowercase().alias("lower"),
    pl.col("name").str.contains("regex|pattern").alias("has_pat"),
    pl.col("name").str.replace("foo","bar"),
    pl.col("name").str.strip_chars(),
    pl.col("name").str.len_bytes().alias("len"),
])

# Split / extract
pl.col("path").str.split("/")
pl.col("email").str.extract(r"@([\w.-]+)", group_index=1)

Date & Time

Parse, extract, shift, truncate/round, and handle time zones.

# Parse strings to datetime
pl.col("ts").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")

# Components
pl.col("ts").dt.year()
pl.col("ts").dt.month()
pl.col("ts").dt.weekday()
pl.col("ts").dt.date()

# Offsets
pl.col("ts").dt.offset_by("+3d")
pl.col("ts").dt.truncate("1h")    # floor to 1 hour
pl.col("ts").dt.round("15m")       # round to 15 min

# Time zones
pl.col("ts").dt.replace_time_zone("UTC")
pl.col("ts").dt.convert_time_zone("Asia/Kolkata")

Joins

All standard joins, plus asof and cross.

df.join(df2, on="key", how="inner")
df.join(df2, on=["k1","k2"], how="left")
df.join(df2, on="key", how="outer")

# Cross join
df.join(df2, how="cross")

# As-of join (time-aware; both frames must be sorted by 'ts')
df.join_asof(df2, left_on="ts", right_on="ts", strategy="backward", by="id")
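
A tiny worked as-of join (hypothetical trades/quotes frames; both sides sorted by the key):

trades = pl.DataFrame({"ts": [2, 5, 9], "qty": [10, 20, 30]}).sort("ts")
quotes = pl.DataFrame({"ts": [1, 4, 8], "px": [100.0, 101.0, 99.5]}).sort("ts")

# For each trade, take the latest quote at or before its ts
out = trades.join_asof(quotes, on="ts", strategy="backward")
# px -> 100.0, 101.0, 99.5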

GroupBy & Aggregations

Newer Polars uses group_by; the older groupby spelling is deprecated.

# Single agg
out = df.group_by("grp").agg(pl.col("a").sum())

# Multiple aggs
out = df.group_by("grp").agg([
    pl.col("a").mean().alias("a_mean"),
    pl.col("b").sum().alias("b_sum"),
    pl.len().alias("n")
])

# Multi-key group
out = df.group_by(["grp","k"]).agg(pl.all().sum())

# Aggregate all numeric columns with the same function
out = df.group_by("grp").agg(
    pl.selectors.numeric().sum().name.suffix("_sum")
)
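
On the toy df from Setup, a sketch with the values you should expect:

out = df.group_by("grp", maintain_order=True).agg(
    pl.col("a").sum().alias("a_sum"),
    pl.len().alias("n"),
)
# grp "x": a_sum = 3, n = 2
# grp "y": a_sum = 7, n = 2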

Window / Rolling

Use over() for partitioned window calcs; rolling_*() for time or index windows.

# Partitioned window
out = df.with_columns([
    (pl.col("b") - pl.col("b").mean().over("grp")).alias("b_demean"),
    pl.col("b").cum_sum().over("grp").alias("b_cumsum"),
])

# Rolling (by row count)
df.with_columns(
    pl.col("b").rolling_mean(window_size=3).alias("b_roll3")
)

# Time-based rolling (requires a sorted time column)
df.sort("ts").with_columns(
    pl.col("val").rolling_mean_by("ts", window_size="7d").alias("mean_7d")
    # older versions: rolling_mean(window_size="7d", by="ts")
)
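
Window calcs on the toy df from Setup, with expected values for reference:

out = df.with_columns(
    pl.col("b").mean().over("grp").alias("b_grp_mean"),   # x -> 15.0, y -> 35.0
    pl.col("b").cum_sum().over("grp").alias("b_cumsum"),  # x -> 10, 30; y -> 30, 70
)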

Pivot / Melt / Reshape

# Pivot wider
wide = df.pivot(on="metric", index="id", values="val", aggregate_function="sum")   # columns= in older versions

# Melt (long)
long = df.unpivot(index=["id"], on=["m1","m2"], variable_name="metric", value_name="val")   # melt() in older versions

# One-hot encode
onehot = df.to_dummies(columns=["cat"])
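
A pivot/unpivot round trip on a toy long frame (argument names follow recent Polars):

long = pl.DataFrame({
    "id": [1, 1, 2, 2],
    "metric": ["m1", "m2", "m1", "m2"],
    "val": [10, 20, 30, 40],
})

wide = long.pivot(on="metric", index="id", values="val", aggregate_function="sum")
# columns: id, m1, m2
back = wide.unpivot(index="id", on=["m1", "m2"], variable_name="metric", value_name="val")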

Missing & Nulls

# Detect
pl.col("x").is_null()
pl.col("x").is_nan()

# Fill
out = df.with_columns([
    pl.col("x").fill_null(0),
    pl.col("y").fill_nan(pl.mean("y")),
    pl.coalesce(["x","y", pl.lit(0)]).alias("first_non_null")
])

# Drop rows with any nulls
clean = df.drop_nulls()
# Drop rows where specific subset has nulls
clean2 = df.drop_nulls(["x","y"])

Sort, Unique, Sample

df.sort("col", descending=True)

# Multi-column with per-column order
df.sort(by=["a","b"], descending=[True, False])

# Unique / distinct
uniq_rows = df.unique()                 # on all cols
uniq_vals = df.select(pl.col("a").unique())

# Counts
vc = df.select(pl.col("a").value_counts())

# Sample
sampled = df.sample(n=100, shuffle=True, seed=42)
frac = df.sample(fraction=0.2, with_replacement=False)

Categoricals & One‑Hot

df = df.with_columns(pl.col("cat").cast(pl.Categorical))

# Ordered categorical (set at cast time in newer versions; cat.set_ordering in older ones)
df = df.with_columns(pl.col("cat").cast(pl.Categorical(ordering="lexical")))

enc = df.to_dummies(columns=["cat"])  # one-hot

UDFs & Conditional Logic

Prefer built-in expressions. Use map_elements for Python UDFs when necessary.

# Conditional with when/then/otherwise
out = df.with_columns(
    pl.when(pl.col("a") >= 10).then(pl.lit("big")).otherwise(pl.lit("small")).alias("size")
)

# Python UDF (element-wise mapping)
import math
out = df.with_columns(
    pl.col("a").map_elements(math.log1p, return_dtype=pl.Float64).alias("log1p_a")
)
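
When a native expression exists, prefer it over the UDF; recent Polars has Expr.log1p, so the same transform needs no Python callback:

out = df.with_columns(pl.col("a").log1p().alias("log1p_a"))  # no per-element Python calls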

Concat & Stack

# Vertical (rows)
pl.concat([df1, df2], how="vertical")

# Horizontal (columns)
pl.concat([df1, df2], how="horizontal")

# Diagonal (fill missing with nulls)
pl.concat([df1, df2], how="diagonal")

# Add a row index column
out = df.with_row_index(name="row_id")   # with_row_count() in older versions

Lazy API (scan → optimize → collect)

Build a query plan with scan_* or df.lazy(), then collect().

lf = pl.scan_csv("big.csv")  # deferred

q = (
  lf.filter(pl.col("a") > 0)
    .with_columns((pl.col("b")*2).alias("b2"))
    .group_by("grp").agg([
        pl.col("a").mean().alias("a_mean"),
        pl.col("b2").sum().alias("b2_sum"),
    ])
    .sort("a_mean", descending=True)
)

q.explain()                   # show optimized plan
res = q.collect(streaming=True)  # execute

# Preview a few rows without materializing the full result
preview = q.head(10).collect()   # fetch(10) in older versions
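
The same plan-then-collect pattern applies to in-memory frames via df.lazy(); a minimal sketch:

df = pl.DataFrame({"grp": ["x", "x", "y"], "a": [1, 2, 3]})

res = (
    df.lazy()
      .filter(pl.col("a") > 1)
      .group_by("grp").agg(pl.col("a").sum())
      .collect()
)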

Performance Tips

  • Prefer scan_* + lazy for large data; enable streaming=True on collect when possible.
  • Push work into expressions (no Python loops); avoid map_elements unless required.
  • Use projection pruning: select only needed columns early.
  • Filter early in lazy plans to reduce data movement.
  • Use .explain() to inspect plans and .profile() to measure (if available in your version); see the sketch below.
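
A sketch combining the tips above (file and column names are illustrative):

q = (
    pl.scan_parquet("events.parquet")
      .select("user_id", "amount", "ts")               # projection pruning
      .filter(pl.col("amount") > 0)                    # filter early
      .group_by("user_id").agg(pl.col("amount").sum())
)
print(q.explain())                 # check the optimized plan
res = q.collect(streaming=True)    # stream for larger-than-memory data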

Pandas ↔ Polars (Quick Map)

Handy mapping for common tasks.

Task          Pandas                       Polars
Read CSV      pd.read_csv("f.csv")         pl.read_csv("f.csv") / pl.scan_csv("f.csv")
Filter        df[df.a > 1]                 df.filter(pl.col("a") > 1)
Add column    df["c"] = df.a + df.b        df.with_columns((pl.col("a") + pl.col("b")).alias("c"))
Group + agg   df.groupby("g").sum()        df.group_by("g").agg(pl.all().sum())
Pivot         df.pivot_table(...)          df.pivot(...)
One-hot       pd.get_dummies(df)           df.to_dummies()
Rolling       df.rolling(...).mean()       pl.col("x").rolling_mean(...)
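
One row of the map written out end to end (assumes pandas is installed; frames are toy data):

import pandas as pd

pdf = pd.DataFrame({"g": ["x", "x", "y"], "a": [1, 2, 3]})
out_pd = pdf.groupby("g", as_index=False)["a"].sum()

pldf = pl.DataFrame({"g": ["x", "x", "y"], "a": [1, 2, 3]})
out_pl = pldf.group_by("g").agg(pl.col("a").sum())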