Setup & Basics
pip install polars
import polars as pl
pl.__version__ # check version
# Create DataFrame
df = pl.DataFrame({
"a":[1,2,3,4],
"b":[10,20,30,40],
"grp":["x","x","y","y"],
})
# Column selector helpers
pl.all() # all columns
pl.col("a") # a single column expr
pl.col("a", "b") # multiple cols by name
pl.selectors.numeric() # numeric columns
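Selectors are usually imported as cs and can be combined with set operators; a small sketch (the *_id names are illustrative):
import polars.selectors as cs
df.select(cs.numeric() | cs.string())     # union: numeric or string columns
df.select(cs.all() - cs.ends_with("_id")) # difference: everything except *_id columns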
I/O (CSV, Parquet, JSON, IPC)
Use read_* for eager, scan_* for lazy (deferred) reading.
# CSV
pl.read_csv("file.csv")
pl.scan_csv("file.csv") # lazy
# Parquet
pl.read_parquet("file.parquet")
pl.scan_parquet("file.parquet")
# JSON / NDJSON
pl.read_json("file.json") # eager (array or records)
pl.read_ndjson("file.ndjson") # newline-delimited
# IPC / Feather
pl.read_ipc("file.arrow")
pl.scan_ipc("file.arrow")
# Write
df.write_csv("out.csv")
df.write_parquet("out.parquet")
df.write_json("out.json") # or write_ndjson
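A lazy scan can also be streamed straight to a file without materializing it in memory; a minimal sketch (paths are placeholders):
pl.scan_csv("big.csv").sink_parquet("out.parquet") # streaming CSV -> Parquet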
Select & Filter
Use expressions inside select / with_columns / filter.
df.select(pl.col("a"), (pl.col("b")*2).alias("b2"))
# Filter
out = df.filter( (pl.col("a") > 1) & (pl.col("b") <= 30) )
# Select by dtype or name pattern
num = df.select(pl.selectors.numeric())
starts = df.select(pl.col("^prefix_.*$")) # regex pattern (must be anchored with ^ and $)
# Multi-column ops
df.select(pl.all().sum().name.suffix("_sum")) # .suffix() in older versions
# Row-wise reductions
df.select(pl.max_horizontal(["a","b"]).alias("row_max"))
Derive Columns & Casting
Prefer vectorized expressions. Avoid Python loops.
# Add / transform columns
out = df.with_columns([
(pl.col("a") + pl.col("b")).alias("a_plus_b"),
pl.when(pl.col("a") > 2).then(1).otherwise(0).alias("flag"),
pl.col("grp").cast(pl.Categorical),
])
# Rename / drop
out = df.rename({"a":"A"}).drop(["b"])
# Type casting
out = df.with_columns([
pl.col("a").cast(pl.Int64),
pl.col("b").cast(pl.Float64),
])
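To make "avoid Python loops" concrete, a rough contrast (the loop version is for illustration only):
# Slow: Python-level loop over rows
slow = [row["a"] + row["b"] for row in df.iter_rows(named=True)]
# Fast: one vectorized expression, executed in Rust
fast = df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))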
Strings
df.with_columns([
pl.col("name").str.to_lowercase().alias("lower"),
pl.col("name").str.contains("regex|pattern").alias("has_pat"),
pl.col("name").str.replace("foo","bar"),
pl.col("name").str.strip_chars(),
pl.col("name").str.len_bytes().alias("len"),
])
# Split / extract
pl.col("path").str.split("/")
pl.col("email").str.extract(r"@([\w.-]+)", group_index=1)
Date & Time
Parse, extract, shift, truncate/round, and handle time zones.
# Parse strings to datetime
pl.col("ts").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")
# Components
pl.col("ts").dt.year()
pl.col("ts").dt.month()
pl.col("ts").dt.weekday()
pl.col("ts").dt.date()
# Offsets
pl.col("ts").dt.offset_by("+3d")
pl.col("ts").dt.truncate("1h") # floor to 1 hour
pl.col("ts").dt.round("15m") # round to 15 min
# Time zones
pl.col("ts").dt.replace_time_zone("UTC")
pl.col("ts").dt.convert_time_zone("Asia/Kolkata")
Joins
All standard joins, plus asof and cross.
df.join(df2, on="key", how="inner")
df.join(df2, on=["k1","k2"], how="left")
df.join(df2, on="key", how="outer")
# Cross join
df.join(df2, how="cross")
# As-of join (time-aware; both frames must be sorted by 'ts')
df.join_asof(df2, left_on="ts", right_on="ts", strategy="backward", by="id")
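Semi and anti joins filter by key membership without adding columns from the right frame:
df.join(df2, on="key", how="semi") # rows of df that have a match in df2
df.join(df2, on="key", how="anti") # rows of df with no match in df2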
GroupBy & Aggregations
Newer Polars versions use group_by (the old groupby spelling is deprecated).
# Single agg
out = df.group_by("grp").agg(pl.col("a").sum())
# Multiple aggs
out = df.group_by("grp").agg([
pl.col("a").mean().alias("a_mean"),
pl.col("b").sum().alias("b_sum"),
pl.len().alias("n")
])
# Multi-key group
out = df.group_by(["grp","k"]).agg(pl.all().sum())
# Aggregating multiple columns with same fns
out = df.group_by("grp").agg(
pl.selectors.numeric().sum().name.suffix("_sum")
)
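Expressions inside agg can themselves be filtered, which often replaces a separate pre-filter; a small sketch:
out = df.group_by("grp").agg(
    pl.col("a").filter(pl.col("b") > 20).sum().alias("a_sum_where_b_gt_20")
)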
Window / Rolling
Use over() for partitioned window calcs; rolling_*() for time or index windows.
# Partitioned window
out = df.with_columns([
(pl.col("b") - pl.col("b").mean().over("grp")).alias("b_demean"),
pl.col("b").cum_sum().over("grp").alias("b_cumsum"),
])
# Rolling (by row count)
df.with_columns(
pl.col("b").rolling_mean(window_size=3).alias("b_roll3")
)
# Time-based rolling (requires a time column)
df.sort("ts").with_columns([
pl.col("val").rolling_mean(by="ts", period="7d").alias("mean_7d")
])
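For fixed (non-sliding) time buckets, group_by_dynamic aggregates per window; a sketch reusing the "ts" and "val" columns above:
out = (
    df.sort("ts")
    .group_by_dynamic("ts", every="1d")
    .agg(pl.col("val").mean().alias("daily_mean"))
)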
Pivot / Melt / Reshape
# Pivot wider
wide = df.pivot(values="val", index="id", on="metric", aggregate_function="sum") # 'columns=' in older versions
# Unpivot / melt (wide -> long); called melt() in older versions
long = df.unpivot(index=["id"], on=["m1","m2"], variable_name="metric", value_name="val")
# Get dummies (one-hot)
onehot = df.to_dummies(columns=["cat"])
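For list columns, explode reshapes to one row per element; a sketch assuming a hypothetical list column "tags":
exploded = df.explode("tags")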
Missing & Nulls
# Detect
pl.col("x").is_null()
pl.col("x").is_nan()
# Fill
out = df.with_columns([
pl.col("x").fill_null(0),
pl.col("y").fill_nan(pl.mean("y")),
pl.coalesce(["x","y", pl.lit(0)]).alias("first_non_null")
])
# Drop rows with any nulls
clean = df.drop_nulls()
# Drop rows where specific subset has nulls
clean2 = df.drop_nulls(["x","y"])
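fill_null also accepts strategies instead of a literal value:
out = df.with_columns([
    pl.col("x").fill_null(strategy="forward"), # carry last valid value forward
    pl.col("y").fill_null(strategy="mean"),    # fill with the column mean
])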
Sort, Unique, Sample
df.sort("col", descending=True)
# Multi-column with per-column order
df.sort(by=["a","b"], descending=[True, False])
# Unique / distinct
uniq_rows = df.unique() # on all cols
uniq_vals = df.select(pl.col("a").unique())
# Counts
vc = df.select(pl.col("a").value_counts())
# Sample
sampled = df.sample(n=100, shuffle=True, seed=42)
frac = df.sample(fraction=0.2, with_replacement=False)
Categoricals & One‑Hot
df = df.with_columns(pl.col("cat").cast(pl.Categorical))
# Set ordering (optional)
df = df.with_columns(pl.col("cat").cat.set_ordering("lexical"))
enc = df.to_dummies(columns=["cat"]) # one-hot
UDFs & Conditional Logic
Prefer built-in expressions. Use map_elements for Python UDFs when necessary.
# Conditional with when/then/otherwise (wrap literals in pl.lit; bare strings are read as column names)
out = df.with_columns(
    pl.when(pl.col("a") >= 10).then(pl.lit("big")).otherwise(pl.lit("small")).alias("size")
)
# Python function (row-wise element mapping)
import math
out = df.with_columns(
pl.col("a").map_elements(lambda x: math.log1p(x)).alias("log1p_a")
)
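A UDF that needs several columns can take them packed in a struct; map_elements then receives each row as a dict (a sketch, still slower than native expressions):
out = df.with_columns(
    pl.struct(["a", "b"])
    .map_elements(lambda row: row["a"] * row["b"], return_dtype=pl.Int64)
    .alias("a_times_b")
)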
Concat & Stack
# Vertical (rows)
pl.concat([df1, df2], how="vertical")
# Horizontal (columns)
pl.concat([df1, df2], how="horizontal")
# Diagonal (fill missing with nulls)
pl.concat([df1, df2], how="diagonal")
# With row count
out = df.with_row_index(name="row_id") # with_row_count() in older versions
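DataFrame-level stacking does the same without going through pl.concat:
stacked = df1.vstack(df2) # append rows (schemas must match)
widened = df1.hstack(df2) # append columns (heights must match)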
Lazy API (scan → optimize → collect)
Build a query plan with scan_* or df.lazy(), then collect().
lf = pl.scan_csv("big.csv") # deferred
q = (
lf.filter(pl.col("a") > 0)
.with_columns((pl.col("b")*2).alias("b2"))
.group_by("grp").agg([
pl.col("a").mean().alias("a_mean"),
pl.col("b2").sum().alias("b2_sum"),
])
.sort("a_mean", descending=True)
)
print(q.explain()) # show optimized plan
res = q.collect(streaming=True) # execute
# Preview a few result rows
preview = q.head(10).collect() # fetch(10) in older Polars versions
Performance Tips
- Prefer scan_* + lazy for large data; enable streaming=True on collect when possible.
- Push work into expressions (no Python loops); avoid map_elements unless required.
- Use projection pruning: select only needed columns early.
- Filter early in lazy plans to reduce data movement.
- Use .explain() to inspect plans; .profile() to measure (if available in your version).
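The last tip in practice; profile() executes the query and returns the result plus per-node timings (a sketch using q from above):
result, timings = q.profile() # timings is a DataFrame of start/end times per plan node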
Pandas ↔ Polars (Quick Map)
Handy mapping for common tasks.
Task | Pandas | Polars |
---|---|---|
Read CSV | pd.read_csv("f.csv") | pl.read_csv("f.csv") / pl.scan_csv |
Filter | df[df.a>1] | df.filter(pl.col("a")>1) |
Add Column | df["c"]=df.a+df.b | df.with_columns((pl.col("a")+pl.col("b")).alias("c")) |
Group + Agg | df.groupby("g").sum() | df.group_by("g").agg(pl.all().sum()) |
Pivot | df.pivot_table(...) | df.pivot(...) |
One‑hot | pd.get_dummies | df.to_dummies |
Rolling | df.rolling(...) | pl.col("x").rolling_mean(...) |