The Problem
You have multiple input files produced by different processes. You want to read, process, and save derived artifacts while capturing.
Let’s simulate the data and folder structure:
if (requireNamespace("pkgload", quietly = TRUE)) {
pkgload::load_all(".")
} else {
library(stamp)
}
#> ℹ Loading stamp
library(data.table)
set.seed(123)
# Create an isolated temporary root so all writes occur outside the package tree.
# Use a short name to avoid long path issues on Windows (PATH_MAX = 260).
root_dir <- fs::path_temp("s") |>
fs::dir_create()
old_wd <- setwd(root_dir)
on.exit(setwd(old_wd), add = TRUE)
# Initialize stamp under this temp root
st_init(root = root_dir)
#> ✔ stamp initialized
#> root: /tmp/RtmpSn2PpI/s
#> state: /tmp/RtmpSn2PpI/s/.stamp
# Define subdirectories (they will be created as needed)
welfare_dir <- fs::path(root_dir, "data", "welfare")
macro_dir <- fs::path(root_dir, "data", "macro")
population_dir <- macro_dir
outputs_dir <- fs::path(root_dir, "outputs") |>
fs::dir_create()
welfare_parts_dir <- fs::path(root_dir, "data", "welfare_parts")
summary_parts_dir <- fs::path(root_dir, "outputs", "summary_parts")
fs::dir_create(c(welfare_dir, macro_dir, welfare_parts_dir, summary_parts_dir))
# Helper: list only data files with known extensions to avoid locks/sidecars
data_files <- function(dir) {
f <- fs::dir_ls(dir, type = "file")
ext <- tolower(fs::path_ext(f))
allowed <- c("qs2", "qs", "rds", "csv", "fst", "json")
f[ext %in% allowed]
}The Solution
Harmonizing Economic Indicators with Versioning & Lineage
We walk through a realistic data engineering scenario: several country/year micro datasets and macro indicators must be combined into a standardized output table. We want:
- Consistent saving, versioning, and timestamp metadata for each input artifact.
- Explicit primary keys (PK) recorded on disk.
- A derived summary table saved with full lineage (parents listed).
- Demonstrating how a single input change propagates using
st_is_stale(),st_plan_rebuild(), andst_rebuild(). - Adding new data (ARG 2015) and observing minimal recomputation.
- Producing the same output using partitioned artifacts for partial re-runs.
Countries/years for welfare (micro) data: COL2010, COL2012, MEX2010, MEX2015, PRY2011, PRY2014.
-
Reporting levels:
- COL and MEX: both
urbanandrural. - PRY: only
national.
- COL and MEX: both
Macro data (CPI, GDP, population) exist for COL, MEX, PRY, BRA, ARG (2010–2015). CPI & population have all three reporting levels (
national,urban,rural) for every country-year.
1. Simulate and Save Input Micro Data
We’ll create small synthetic micro datasets with columns:
country, year, reporting_level,
hh_id, welfare, weight.
fs::dir_create(welfare_dir)
welfare_specs <- data.frame(
country = c("COL", "COL", "MEX", "MEX", "PRY", "PRY"),
year = c(2010, 2012, 2010, 2015, 2011, 2014),
stringsAsFactors = FALSE
)
mk_reporting <- function(cty) {
if (cty %in% c("COL", "MEX")) c("urban", "rural") else c("national")
}
simulate_welfare <- function(country, year) {
rl <- mk_reporting(country)
# 200 households per reporting level
dt_list <- lapply(rl, function(rlev) {
n <- 200L
data.table(
country = country,
year = year,
reporting_level = rlev,
hh_id = sprintf("%s_%s_%s_%03d", country, year, rlev, seq_len(n)),
welfare = round(
rlnorm(n, meanlog = log(1000 + year) - 7, sdlog = 0.5),
2
),
weight = runif(n, 0.5, 2)
)
})
rbindlist(dt_list)
}
welfare_paths <- list()
for (i in seq_len(nrow(welfare_specs))) {
row <- welfare_specs[i, ]
dt <- simulate_welfare(row$country, row$year)
fn <- fs::path(welfare_dir, sprintf("%s_%d.qs2", row$country, row$year))
# Primary key columns include household id for uniqueness
st_save(
dt,
fn,
pk = c("country", "year", "reporting_level", "hh_id"),
domain = "welfare"
)
welfare_paths[[length(welfare_paths) + 1]] <- fn
}
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2 @ version
#> 75ad53d22bdc889c
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2 @ version
#> 22a631e9cf4b654b
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2 @ version
#> 8cd1d3757fd8861e
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2 @ version
#> 5ae25a0f064c4022
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2 @ version
#> 1661f2a8e0925673
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2 @ version
#> 8515270d66067cfc
unlist(welfare_paths)
#> [1] "/tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2"
#> [2] "/tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2"
#> [3] "/tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2"
#> [4] "/tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2"
#> [5] "/tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2"
#> [6] "/tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2"Each call to st_save() assigns a version id (if content
differs) and writes sidecar metadata including PK.
Inspect one artifact:
st_info(fs::path(welfare_dir, "COL_2010.qs2"))
#> $sidecar
#> $sidecar$path
#> [1] "/tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2"
#>
#> $sidecar$format
#> [1] "qs2"
#>
#> $sidecar$created_at
#> [1] "2025-12-22T11:01:26.323109Z"
#>
#> $sidecar$size_bytes
#> [1] 4803
#>
#> $sidecar$content_hash
#> [1] "883dbf2745b89d63"
#>
#> $sidecar$code_hash
#> NULL
#>
#> $sidecar$file_hash
#> NULL
#>
#> $sidecar$code_label
#> NULL
#>
#> $sidecar$parents
#> list()
#>
#> $sidecar$attrs
#> list()
#>
#> $sidecar$pk
#> $sidecar$pk$keys
#> [1] "country" "year" "reporting_level" "hh_id"
#>
#>
#> $sidecar$domain
#> [1] "welfare"
#>
#>
#> $catalog
#> $catalog$latest_version_id
#> [1] "75ad53d22bdc889c"
#>
#> $catalog$n_versions
#> [1] 1
#>
#>
#> $snapshot_dir
#> /tmp/RtmpSn2PpI/s/.stamp/versions/data/welfare/COL_2010.qs2/75ad53d22bdc889c
#>
#> $parents
#> list()2. Simulate and Save Macro Data (CPI, GDP, Population)
CPI & population are reporting-level granular; GDP is country-year only.
fs::dir_create(macro_dir)
years <- 2010:2015
countries_macro <- c("COL", "MEX", "PRY", "BRA", "ARG")
levels_all <- c("national", "urban", "rural")
simulate_cpi <- function() {
out <- CJ(
country = countries_macro,
year = years,
reporting_level = levels_all
)
out[, cpi := round(runif(.N, 80, 140), 2)]
out
}
simulate_population <- function() {
out <- CJ(
country = countries_macro,
year = years,
reporting_level = levels_all
)
out[, population := round(runif(.N, 5e5, 20e6))]
out
}
simulate_gdp <- function() {
out <- CJ(country = countries_macro, year = years)
out[, gdp := round(runif(.N, 1e9, 5e11))]
out
}
cpi <- simulate_cpi()
pop <- simulate_population()
gdp <- simulate_gdp()
st_save(
cpi,
fs::path(macro_dir, "cpi.qs2"),
pk = c("country", "year", "reporting_level"),
domain = "macro"
)
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 @ version 75a94b22ddbf0ca2
st_save(
pop,
fs::path(macro_dir, "population.qs2"),
pk = c("country", "year", "reporting_level"),
domain = "macro"
)
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/macro/population.qs2 @ version
#> 5c212fac3a83dc1f
st_save(
gdp,
fs::path(macro_dir, "gdp.qs2"),
pk = c("country", "year"),
domain = "macro"
)
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2 @ version 0f3eb09ce38cedc6Version listing for CPI:
st_versions(fs::path(macro_dir, "cpi.qs2"))3. Aggregation Function foo() and Output Table
We compute per (country, year, reporting_level) welfare
statistics and merge with CPI, GDP, population. We demonstrate loading
inputs inside the function for self-containment.
foo <- function() {
# Load all welfare micro artifacts (only recognized data files)
welfare_files <- data_files(welfare_dir)
wf <- rbindlist(lapply(welfare_files, st_load))
stats <- wf[,
.(
welfare_mean = weighted.mean(welfare, weight),
welfare_median = as.numeric(stats::median(welfare)),
welfare_sd = stats::sd(welfare),
welfare_n = .N
),
by = .(country, year, reporting_level)
]
cpi <- st_load(fs::path(macro_dir, "cpi.qs2"))
gdp <- st_load(fs::path(macro_dir, "gdp.qs2"))
pop <- st_load(fs::path(macro_dir, "population.qs2"))
# Merge; CPI & population contain rows not present in welfare; keep welfare rows only
out <- merge(
stats,
cpi,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(
out,
pop,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(out, gdp, by = c("country", "year"), all.x = TRUE)
setcolorder(
out,
c(
"country",
"year",
"reporting_level",
"welfare_mean",
"welfare_median",
"welfare_sd",
"welfare_n",
"cpi",
"gdp",
"population"
)
)
out
}
summary_table <- foo()
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/population.qs2
head(summary_table)
#> Key: <country, year>
#> country year reporting_level welfare_mean welfare_median welfare_sd
#> <char> <int> <char> <num> <num> <num>
#> 1: COL 2010 rural 3.188479 2.895 1.604006
#> 2: COL 2010 urban 3.065242 2.665 1.681036
#> 3: COL 2012 rural 3.196254 2.820 1.872963
#> 4: COL 2012 urban 3.079524 2.750 1.627238
#> 5: MEX 2010 rural 3.247370 2.775 1.696690
#> 6: MEX 2010 urban 3.006862 2.710 1.606477
#> welfare_n cpi gdp population
#> <int> <num> <num> <num>
#> 1: 200 92.71 449669674645 10746038
#> 2: 200 99.10 449669674645 5152778
#> 3: 200 93.11 55701684731 7761159
#> 4: 200 111.42 55701684731 15249755
#> 5: 200 93.68 384563952393 5367608
#> 6: 200 125.54 384563952393 12591517Save the summary with lineage: parents include all welfare micro datasets plus CPI/GDP/population.
parent_paths <- c(
data_files(welfare_dir),
fs::path(macro_dir, "cpi.qs2"),
fs::path(macro_dir, "gdp.qs2"),
fs::path(macro_dir, "population.qs2")
)
parents <- lapply(parent_paths, function(p) {
list(path = p, version_id = st_latest(p))
})
out_summary_path <- fs::path(outputs_dir, "welfare_summary.qs2")
st_save(
summary_table,
out_summary_path,
pk = c("country", "year", "reporting_level"),
parents = parents,
domain = "summary"
)
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 @ version
#> 2b148d64bdefae4a
st_lineage(out_summary_path, depth = 1)
#> level child_path child_version
#> 1 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 2 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 3 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 4 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 5 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 6 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 7 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 8 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> 9 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 2b148d64bdefae4a
#> parent_path parent_version
#> 1 /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2 75ad53d22bdc889c
#> 2 /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2 22a631e9cf4b654b
#> 3 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2 8cd1d3757fd8861e
#> 4 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2 5ae25a0f064c4022
#> 5 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2 1661f2a8e0925673
#> 6 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2 8515270d66067cfc
#> 7 /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 75a94b22ddbf0ca2
#> 8 /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2 0f3eb09ce38cedc6
#> 9 /tmp/RtmpSn2PpI/s/data/macro/population.qs2 5c212fac3a83dc1fNote: to make provenance explicit, pass the producing function as
code= so st_save() records the function hash.
Below we demonstrate both patterns: foo() loads its own
data, while bar() accepts pre-loaded objects and is saved
with code = bar.
# Variant that accepts loaded objects (preferred for testability and clarity)
bar <- function(welfare_list, cpi_tbl, gdp_tbl, pop_tbl) {
wf <- data.table::rbindlist(welfare_list)
stats <- wf[,
.(
welfare_mean = weighted.mean(welfare, weight),
welfare_median = as.numeric(stats::median(welfare)),
welfare_sd = stats::sd(welfare),
welfare_n = .N
),
by = .(country, year, reporting_level)
]
out <- merge(
stats,
cpi_tbl,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(
out,
pop_tbl,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(out, gdp_tbl, by = c("country", "year"), all.x = TRUE)
setcolorder(
out,
c(
"country",
"year",
"reporting_level",
"welfare_mean",
"welfare_median",
"welfare_sd",
"welfare_n",
"cpi",
"gdp",
"population"
)
)
out
}
# Example: load inputs outside the worker and call bar()
welfare_files <- data_files(welfare_dir)
welfare_list <- lapply(welfare_files, st_load)
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2
cpi_tbl <- st_load(fs::path(macro_dir, "cpi.qs2"))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
gdp_tbl <- st_load(fs::path(macro_dir, "gdp.qs2"))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2
pop_tbl <- st_load(fs::path(macro_dir, "population.qs2"))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/data/macro/population.qs2
summary_table2 <- bar(welfare_list, cpi_tbl, gdp_tbl, pop_tbl)
st_save(
summary_table2,
out_summary_path,
pk = c("country", "year", "reporting_level"),
parents = parents,
code = bar, # addition the function as parent so st_save() can track it
domain = "summary"
)
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 @ version
#> 774500cf09b41ae5
st_lineage(out_summary_path, depth = 1)
#> level child_path child_version
#> 1 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 2 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 3 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 4 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 5 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 6 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 7 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 8 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> 9 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 774500cf09b41ae5
#> parent_path parent_version
#> 1 /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2 75ad53d22bdc889c
#> 2 /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2 22a631e9cf4b654b
#> 3 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2 8cd1d3757fd8861e
#> 4 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2 5ae25a0f064c4022
#> 5 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2 1661f2a8e0925673
#> 6 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2 8515270d66067cfc
#> 7 /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 75a94b22ddbf0ca2
#> 8 /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2 0f3eb09ce38cedc6
#> 9 /tmp/RtmpSn2PpI/s/data/macro/population.qs2 5c212fac3a83dc1fHandling multiple function dependencies
If the output depends on more than one function (for example
foo() calls f1() and f2()), you
can provide a combined code metadata to
st_save() that captures all producing functions.
st_save() accepts any object for code; best
practice is to pass a short list of the functions involved,
e.g. code = list(foo = foo, helper1 = f1, helper2 = f2).
The package hashes code and records a
code_hash in the sidecar so downstream
st_is_stale() can detect changes in any contributor.
Example pattern:
# Provide multiple functions to st_save so code_hash changes if any of them change
# st_save(x, path, code = list(foo = foo, helper1 = f1, helper2 = f2))4. Modify CPI (COL 2012) → New Version & Staleness
We change CPI for COL 2012 (all reporting levels) to illustrate a new version and stale downstream artifact.
cpi2 <- st_load(fs::path(macro_dir, "cpi.qs2"))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
cpi2[country == "COL" & year == 2012, cpi := cpi * 1.05] # 5% adjustment
st_save(cpi2, fs::path(macro_dir, "cpi.qs2")) # new version recorded
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 @ version 7d301454602ecbd6
st_versions(fs::path(macro_dir, "cpi.qs2"))[1:3]
#> version_id artifact_id content_hash code_hash size_bytes
#> <char> <char> <char> <char> <num>
#> 1: 7d301454602ecbd6 2f3e54a46f3de65a 7ecab5cc3c1c678f <NA> 890
#> 2: 75a94b22ddbf0ca2 2f3e54a46f3de65a a0a1d1f0acc4fb3a <NA> 874
#> 3: <NA> <NA> <NA> <NA> NA
#> created_at sidecar_format
#> <char> <char>
#> 1: 2025-12-22T11:01:27.979066Z json
#> 2: 2025-12-22T11:01:26.888534Z json
#> 3: <NA> <NA>
st_is_stale(out_summary_path) # should be TRUE
#> [1] TRUEBuilders, plans and rebuilds — concepts and how we use them here
A short glossary and explanation of the concepts used in the examples above:
Artifact: a saved object on disk (one path managed by stamp).
Sidecar: metadata file beside each artifact (PKs, parents, code hash, domain, timestamps).
Version: immutable snapshot; new version id recorded when
st_save()detects changed content.Parent: pointer (path + version_id) to an upstream artifact used to produce a downstream artifact.
Lineage: directed graph linking artifacts via parent relationships; inspect with
st_lineage().Builder: function registered with
st_register_builder(path, builder_fn)that knows how to produce one artifact. Returns at leastx(object to save) andcode(function(s) used) so code changes trigger staleness.Plan: result of
st_plan_rebuild()describing which targets are stale (parents changed, code changed, or artifact missing) and thus need rebuilding.Rebuild: executing a plan via
st_rebuild(plan); runs builders, saves artifacts, updates sidecars/versions.
General idea - Register builders for reproducible targets. Builders
should be deterministic and return the object to save (do not save
directly inside the builder). - Use st_plan_rebuild() to
compute what needs to be rebuilt (it inspects recorded parents and code
hashes). - Inspect the plan, then call st_rebuild() to
execute builders and record new versions.
How this vignette implements those ideas - We register a builder for
out_summary_path (see the
st_register_builder() call below). That builder returns
list(x = foo(), code = foo, code_label = "aggregate_welfare").
- st_plan_rebuild() compares current parent version ids and
the code hash of foo against what the sidecar recorded; if
any parent or code changed the plan will mark the target as stale. -
st_rebuild(plan) calls the registered builder, then
st_save() persistently records the new artifact and sidecar
(including the new code_hash and parent pointers). - This
flow gives you a lightweight, auditable, and incremental pipeline:
change an upstream artifact or a function and the plan will tell you
what to rebuild.
Practical tips and a small example - Builders should prefer to use
the parents argument passed in (this avoids global state
and makes builders easier to test). - If your target depends on multiple
functions, pass them all in code so st_save()
records a combined code hash:
code = list(foo = foo, helper = f1).
Example builder that loads parents explicitly and returns multiple
functions in code:
# Example: register a builder that uses the provided `parents` list and avoids global loads
st_register_builder(out_summary_path, function(path, parents) {
# parents is a list of lists: each element has $path and $version_id
# Filter parent paths to identify welfare partitions vs macros, then load them
welfare_parent_paths <- vapply(
parents,
function(p) grepl("welfare", p$path),
logical(1)
)
welfare_list <- lapply(parents[welfare_parent_paths], function(p) {
st_load(p$path)
})
cpi_tbl <- st_load(fs::path(macro_dir, "cpi.qs2"))
gdp_tbl <- st_load(fs::path(macro_dir, "gdp.qs2"))
pop_tbl <- st_load(fs::path(macro_dir, "population.qs2"))
# Produce the output using a helper function (bar) that accepts pre-loaded inputs
out <- bar(welfare_list, cpi_tbl, gdp_tbl, pop_tbl)
# Return the produced object plus code metadata (capture both producer and helpers)
list(
x = out,
code = list(bar = bar), # include helpers as needed
code_label = "aggregate_welfare_partitioned"
)
})
#> ✔ Registered builder for /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2
#> (default)Inspect the plan before running it:
plan <- st_plan_rebuild(
targets = out_summary_path,
include_targets = TRUE,
mode = "strict"
)
print(plan)
#> level path reason
#> 1 0 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 parent_changed
#> latest_version_before
#> 1 774500cf09b41ae5Run the plan to execute the builders and record new versions:
st_rebuild(plan)
#> ✔ Rebuild level 0: 1 artifact
#> • /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 (parent_changed)
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/population.qs2
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 @ version
#> 81e0cee6f20a21ea
#> OK @ version 81e0cee6f20a21ea
#> ✔ Rebuild summary
#> built 1This example shows the recommended pattern: load from
parents, call a pure worker (like bar()), and
return x and code. That ensures
st_rebuild() can uniformly save artifacts, record lineage,
and compute code_hash changes for future incremental
rebuilds.
Plan & rebuild using a registered builder for the summary artifact.
st_register_builder(out_summary_path, function(path, parents) {
list(x = foo(), code = foo, code_label = "aggregate_welfare")
})
#> ✔ Registered builder for /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2
#> (default)
plan <- st_plan_rebuild(
targets = out_summary_path,
include_targets = TRUE,
mode = "strict"
)
plan
#> [1] level path reason
#> [4] latest_version_before
#> <0 rows> (or 0-length row.names)
st_rebuild(plan)
#> ✔ Nothing to rebuild (empty plan).
st_lineage(out_summary_path, depth = 1)
#> level child_path child_version
#> 1 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 2 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 3 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 4 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 5 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 6 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 7 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 8 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> 9 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 81e0cee6f20a21ea
#> parent_path parent_version
#> 1 /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2 75ad53d22bdc889c
#> 2 /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2 22a631e9cf4b654b
#> 3 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2 8cd1d3757fd8861e
#> 4 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2 5ae25a0f064c4022
#> 5 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2 1661f2a8e0925673
#> 6 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2 8515270d66067cfc
#> 7 /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 7d301454602ecbd6
#> 8 /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2 0f3eb09ce38cedc6
#> 9 /tmp/RtmpSn2PpI/s/data/macro/population.qs2 5c212fac3a83dc1f5. Add New Data (ARG 2015 Welfare)
We introduce a new welfare dataset (ARG 2015) which appears in macro tables already. Only the summary rows for ARG 2015 will be newly added.
dt_arg_2015 <- simulate_welfare("ARG", 2015)
st_save(
dt_arg_2015,
fs::path(welfare_dir, "ARG_2015.qs2"),
pk = c("country", "year", "reporting_level", "hh_id"),
domain = "welfare"
)
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/data/welfare/ARG_2015.qs2 @ version
#> 16a8fd2c4d6bf395
parents2 <- lapply(
c(
data_files(welfare_dir),
fs::path(macro_dir, "cpi.qs2"),
fs::path(macro_dir, "gdp.qs2"),
fs::path(macro_dir, "population.qs2")
),
function(p) list(path = p, version_id = st_latest(p))
)
st_save(
foo(),
out_summary_path,
pk = c("country", "year", "reporting_level"),
parents = parents2,
domain = "summary"
)
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/ARG_2015.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2
#> ✔ Loaded [qs2] ← /tmp/RtmpSn2PpI/s/data/macro/population.qs2
#> ✔ Saved [qs2] → /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 @ version
#> 381caa9fa29d3e6d
head(st_load(out_summary_path))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2
#> country year reporting_level welfare_mean welfare_median welfare_sd
#> <char> <int> <char> <num> <num> <num>
#> 1: ARG 2015 national 3.100148 2.510 1.700340
#> 2: COL 2010 rural 3.188479 2.895 1.604006
#> 3: COL 2010 urban 3.065242 2.665 1.681036
#> 4: COL 2012 rural 3.196254 2.820 1.872963
#> 5: COL 2012 urban 3.079524 2.750 1.627238
#> 6: MEX 2010 rural 3.247370 2.775 1.696690
#> welfare_n cpi gdp population
#> <int> <num> <num> <num>
#> 1: 200 120.4800 250154797653 8785981
#> 2: 200 92.7100 449669674645 10746038
#> 3: 200 99.1000 449669674645 5152778
#> 4: 200 97.7655 55701684731 7761159
#> 5: 200 116.9910 55701684731 15249755
#> 6: 200 93.6800 384563952393 53676086. Partitioned Workflow for Partial Re-execution
Instead of storing one large welfare micro file per country-year, we
can store partitions per (country, year, reporting_level)
using st_save_part() and then recompute only the affected
partitions when an input changes.
fs::dir_create(welfare_parts_dir)
# Write partitioned welfare (one partition per country/year/reporting_level)
for (i in seq_len(nrow(welfare_specs))) {
row <- welfare_specs[i, ]
dt <- simulate_welfare(row$country, row$year) # new simulated (independent from earlier saves)
# split by reporting_level
for (rl in unique(dt$reporting_level)) {
part_dt <- dt[reporting_level == rl]
st_save_part(
part_dt,
base = welfare_parts_dir,
key = list(country = row$country, year = row$year, reporting_level = rl),
code_label = "welfare_partition",
pk = c("country", "year", "reporting_level", "hh_id")
)
}
}
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=urban/year=2010/part.qs2
#> @ version f6448f219d342016
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=rural/year=2010/part.qs2
#> @ version 959de31c2d1ffed4
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=urban/year=2012/part.qs2
#> @ version 38961d4886c4775f
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=rural/year=2012/part.qs2
#> @ version 5ad9186f14978b81
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=urban/year=2010/part.qs2
#> @ version a036807dece25bc1
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=rural/year=2010/part.qs2
#> @ version 14fb0c42cd5bf727
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=urban/year=2015/part.qs2
#> @ version 71f9821a8de8897e
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=rural/year=2015/part.qs2
#> @ version 05684c727b922034
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=PRY/reporting_level=national/year=2011/part.qs2
#> @ version a8972c58d02ab0cf
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/welfare_parts/country=PRY/reporting_level=national/year=2014/part.qs2
#> @ version b1b9a669afaa2f3c
st_list_parts(welfare_parts_dir)[1:6, ]
#> path
#> 1 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=rural/year=2010/part.qs2
#> 2 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=rural/year=2012/part.qs2
#> 3 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=urban/year=2010/part.qs2
#> 4 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=COL/reporting_level=urban/year=2012/part.qs2
#> 5 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=rural/year=2010/part.qs2
#> 6 /tmp/RtmpSn2PpI/s/data/welfare_parts/country=MEX/reporting_level=rural/year=2015/part.qs2
#> country reporting_level year
#> 1 COL rural 2010
#> 2 COL rural 2012
#> 3 COL urban 2010
#> 4 COL urban 2012
#> 5 MEX rural 2010
#> 6 MEX rural 2015Define a partitioned builder writing one output partition per
(country, year, reporting_level) summary row. Each output
partition depends on: the corresponding welfare partition + CPI + GDP +
population (all three macro artifacts are shared parents).
fs::dir_create(summary_parts_dir)
partition_keys <- st_list_parts(welfare_parts_dir)
build_partition_summary <- function(path, parents) {
# path is the final destination for this partition artifact
# Extract key by parsing path (we stored key in directory names)
rel <- fs::path_rel(path, start = summary_parts_dir)
segs <- strsplit(dirname(rel), .Platform$file.sep)[[1]]
segs <- segs[nzchar(segs)]
# Extract only segments that follow the key=value pattern and ignore others
kv_pairs <- lapply(segs, function(s) {
m <- regmatches(s, regexec("^([^=]+)=(.*)$", s))[[1]]
if (length(m) == 3L) list(k = m[2], v = m[3]) else NULL
})
kv_pairs <- Filter(Negate(is.null), kv_pairs)
if (!length(kv_pairs)) {
key <- list()
} else {
key <- setNames(
lapply(kv_pairs, function(x) x$v),
vapply(kv_pairs, function(x) x$k, character(1))
)
}
# Load welfare partition matching key
w_path <- st_part_path(welfare_parts_dir, key, format = NULL)
w <- st_load(w_path)
stats <- w[,
.(
welfare_mean = weighted.mean(welfare, weight),
welfare_median = as.numeric(median(welfare)),
welfare_sd = sd(welfare),
welfare_n = .N
),
by = .(country, year, reporting_level)
]
# Load macro (shared) and subset to key
cpi <- st_load(fs::path(macro_dir, "cpi.qs2"))[
country == key$country &
year == as.integer(key$year) &
reporting_level == key$reporting_level
]
pop <- st_load(fs::path(macro_dir, "population.qs2"))[
country == key$country &
year == as.integer(key$year) &
reporting_level == key$reporting_level
]
gdp <- st_load(fs::path(macro_dir, "gdp.qs2"))[
country == key$country & year == as.integer(key$year)
]
out <- merge(
stats,
cpi,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(
out,
pop,
by = c("country", "year", "reporting_level"),
all.x = TRUE
)
out <- merge(out, gdp, by = c("country", "year"), all.x = TRUE)
list(
x = out,
code = build_partition_summary,
code_label = "partition_summary"
)
}
# List a few partitioned summary partitions
st_list_parts(summary_parts_dir)[1:6, ]
#> [1] NA NA NA NA NA NAPartial Re-execution After CPI Change
If CPI changes only for COL 2012, we want to rebuild only partitions for COL 2012 across reporting levels.
# Modify CPI again (COL 2012) to trigger staleness
cpi3 <- st_load(fs::path(macro_dir, "cpi.qs2"))
#> ✔ Loaded [qs2] ←
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2
cpi3[country == "COL" & year == 2012, cpi := cpi * 1.02]
st_save(cpi3, fs::path(macro_dir, "cpi.qs2"))
#> ✔ Saved [qs2] →
#> /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 @ version d1ff1c23e994508e
# Detect which partition outputs are stale (simple check: those whose parent CPI version differs)
summary_parts <- st_list_parts(summary_parts_dir)
stale_idx <- vapply(summary_parts$path, st_is_stale, logical(1))
summary_parts[stale_idx, ]
#> character(0)Rebuild only stale partitions (manually filtered):
stale_paths <- summary_parts$path[stale_idx]
for (p in stale_paths) {
# reconstruct key from path to identify its welfare partition parent
rel <- fs::path_rel(p, start = summary_parts_dir)
segs <- strsplit(dirname(rel), .Platform$file.sep)[[1]]
kv <- lapply(segs, function(s) strsplit(s, "=")[[1]])
key <- setNames(lapply(kv, function(x) x[2]), lapply(kv, function(x) x[1]))
parents <- list(
list(
path = st_part_path(welfare_parts_dir, key),
version_id = st_latest(st_part_path(welfare_parts_dir, key))
),
list(
path = fs::path(macro_dir, "cpi.qs2"),
version_id = st_latest(fs::path(macro_dir, "cpi.qs2"))
),
list(
path = fs::path(macro_dir, "gdp.qs2"),
version_id = st_latest(fs::path(macro_dir, "gdp.qs2"))
),
list(
path = fs::path(macro_dir, "population.qs2"),
version_id = st_latest(fs::path(macro_dir, "population.qs2"))
)
)
b <- build_partition_summary(p, parents)
st_save(
b$x,
p,
parents = parents,
pk = c("country", "year", "reporting_level"),
code = b$code,
code_label = b$code_label
)
}Only COL 2012 partitions were recomputed, avoiding unnecessary work for other countries/years.
7. Summary
We demonstrated:
- Saving multiple artifacts with PK metadata and automatic versioning.
- Building a derived summary with explicit lineage and rebuilding after an upstream change.
- Adding new data (ARG 2015) and incorporating it with minimal friction.
- A partitioned strategy enabling targeted re-execution for changed inputs.
Explore lineage further:
st_lineage(out_summary_path, depth = 2)[1:10, ]
#> level child_path child_version
#> 1 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 2 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 3 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 4 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 5 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 6 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 7 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 8 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 9 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> 10 1 /tmp/RtmpSn2PpI/s/outputs/welfare_summary.qs2 381caa9fa29d3e6d
#> parent_path parent_version
#> 1 /tmp/RtmpSn2PpI/s/data/welfare/ARG_2015.qs2 16a8fd2c4d6bf395
#> 2 /tmp/RtmpSn2PpI/s/data/welfare/COL_2010.qs2 75ad53d22bdc889c
#> 3 /tmp/RtmpSn2PpI/s/data/welfare/COL_2012.qs2 22a631e9cf4b654b
#> 4 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2010.qs2 8cd1d3757fd8861e
#> 5 /tmp/RtmpSn2PpI/s/data/welfare/MEX_2015.qs2 5ae25a0f064c4022
#> 6 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2011.qs2 1661f2a8e0925673
#> 7 /tmp/RtmpSn2PpI/s/data/welfare/PRY_2014.qs2 8515270d66067cfc
#> 8 /tmp/RtmpSn2PpI/s/data/macro/cpi.qs2 7d301454602ecbd6
#> 9 /tmp/RtmpSn2PpI/s/data/macro/gdp.qs2 0f3eb09ce38cedc6
#> 10 /tmp/RtmpSn2PpI/s/data/macro/population.qs2 5c212fac3a83dc1f
# Remove the temporary root used for vignette examples. Only run in a local
# interactive session when you are sure you no longer need the temporary files.
if (exists("root_dir") && fs::dir_exists(root_dir)) {
# fs::dir_delete() permanently removes the directory tree.
fs::dir_delete(root_dir)
}