Skip to contents
# Use the development build only when explicitly enabled via the DEV_VIGNETTES
# environment variable (exact string "true") and pkgload is installed.
# NOTE(review): no interactive() check is made here, so a non-interactive
# render with DEV_VIGNETTES=true also takes the pkgload branch.
dev_mode <- (Sys.getenv("DEV_VIGNETTES", "false") == "true")

if (dev_mode && requireNamespace("pkgload", quietly = TRUE)) {
  # Load the package from source without exporting internals, sourcing test
  # helpers, or attaching testthat.
  pkgload::load_all(
    export_all = FALSE,
    helpers = FALSE,
    attach_testthat = FALSE
  )
} else {
  # fall back to the installed package (the path CRAN, CI, and pkgdown take)
  library(stamp)
}

This vignette shows how to keep your versions store lean and your tables self-describing:

  • Retention & pruning: keep only the versions you need, either automatically after each save or on demand.
  • Table metadata: record primary keys (PKs) and sidecar metadata so downstream code understands the grain and integrity of each table.
  • Partitioned datasets: read/write tidy “Hive-style” layouts (e.g., country=PER/year=2023/part.qs2) and bind them efficiently.

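Key APIs: st_opts(retain_versions = ...) and st_prune_versions() for retention; st_save(..., pk = ...), st_inspect_pk(), st_add_pk(), and st_with_pk() for primary keys; st_read_sidecar() for sidecar metadata; st_part_path(), st_save_part(), st_list_parts(), and st_load_parts() for partitioned datasets.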

Policy syntax (for retain_versions or st_prune_versions(policy=...)):

  • Inf (default): keep everything.
  • Integer n: keep the n latest versions per artifact.
  • List union: list(n = 5, days = 14) keeps the 5 latest or anything from the last 14 days (latest is always protected).

Implementation note: we recommend that your package initialize retain_versions = Inf via st_opts() defaults. If you wire .st_apply_retention() at the end of st_save(), retention will be enforced automatically after each write. Otherwise, use st_prune_versions() explicitly.


Minimal project scaffold (temp)

# Build the example project under the session temp directory; delete any
# directory left over from a previous run so the example starts clean.
root <- fs::path(tempdir(), "stamp-retention-example")
if (fs::dir_exists(root)) {
  fs::dir_delete(root)
}
# Initialise stamp at `root`; the output below reports the alias, root and
# state directory that were set up.
st_init(root)
##  stamp initialized
##   alias: default
##   root: /tmp/Rtmp3nPsIC/stamp-retention-example
##   state: /tmp/Rtmp3nPsIC/stamp-retention-example/.stamp

We’ll create a few artifacts and multiple versions to demonstrate pruning:

# Artifact paths for the three example artifacts.
pA <- "A.qs"
pB <- "B.qs"
pC <- "C.qs"

# Three successive contents for A, two for B and one for C, so the artifacts
# end up with different numbers of versions.
xA1 <- data.frame(a = 1:3)
xA2 <- data.frame(a = 2:4)
xA3 <- data.frame(a = 3:5)

xB1 <- data.frame(b = letters[1:3])
xB2 <- data.frame(b = letters[2:4])

xC1 <- data.frame(c = 10:12)

# Keep retention OFF initially so we can create multiple versions
st_opts(retain_versions = Inf)
##  stamp options updated
##   retain_versions = "Inf"
# Every save passes the same identity function as `code`; each call reports the
# version id it created.
st_save(xA1, pA, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs @ version
##   45b8280be2ccc790
st_save(xA2, pA, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs @ version
##   c50cc77244e5f241
st_save(xA3, pA, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs @ version
##   d880595449068187
st_save(xB1, pB, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/B.qs @ version
##   6a33c1506fae6895
st_save(xB2, pB, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/B.qs @ version
##   4ebfb3fb06177a93
st_save(xC1, pC, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/C.qs @ version
##   061d9197f737b7d1

Inspect store & catalog:

# Show version directories for each artifact (now stored per-artifact)
# Versions are stored in <artifact_folder>/versions/<version_id>/
# The artifact folder is derived here as the parent directory of the sidecar
# path reported by st_info().
info_a <- st_info(pA, alias = NULL)
artifact_dir_a <- fs::path_dir(info_a$sidecar$path)
versions_dir_a <- fs::path(artifact_dir_a, "versions")

# NOTE(review): the rendered output contains neither the "A.qs versions:"
# header nor a directory tree, so this check appears to have been FALSE when
# the vignette was built — confirm that the derived path is the right one.
if (fs::dir_exists(versions_dir_a)) {
  cat("A.qs versions:\n")
  fs::dir_tree(versions_dir_a, recurse = 1)
}

# Per-artifact version listings; the printed tables show the newest version
# first (see created_at).
print(st_versions(pA, alias = NULL))
##          version_id      artifact_id     content_hash        code_hash
##              <char>           <char>           <char>           <char>
## 1: d880595449068187 a55218221e01eab2 b3c012fc5cb6cfd5 488e8fa49c740261
## 2: c50cc77244e5f241 a55218221e01eab2 2fa93012845f84ac 488e8fa49c740261
## 3: 45b8280be2ccc790 a55218221e01eab2 913f5bf52f2c0263 488e8fa49c740261
##    size_bytes                  created_at sidecar_format
##         <num>                      <char>         <char>
## 1:        243 2026-05-22T14:26:03.014530Z           json
## 2:        243 2026-05-22T14:26:02.968149Z           json
## 3:        243 2026-05-22T14:26:02.910279Z           json
print(st_versions(pB, alias = NULL))
##          version_id      artifact_id     content_hash        code_hash
##              <char>           <char>           <char>           <char>
## 1: 4ebfb3fb06177a93 1282351508383425 eecea3a9080f6878 488e8fa49c740261
## 2: 6a33c1506fae6895 1282351508383425 241f6ccd5b268648 488e8fa49c740261
##    size_bytes                  created_at sidecar_format
##         <num>                      <char>         <char>
## 1:        199 2026-05-22T14:26:03.085866Z           json
## 2:        202 2026-05-22T14:26:03.048031Z           json
print(st_versions(pC, alias = NULL))
##          version_id      artifact_id     content_hash        code_hash
##              <char>           <char>           <char>           <char>
## 1: 061d9197f737b7d1 f3087ae60cdc7820 c3b7f066d3ee5e84 488e8fa49c740261
##    size_bytes                  created_at sidecar_format
##         <num>                      <char>         <char>
## 1:        243 2026-05-22T14:26:03.119914Z           json

Ad-hoc pruning (explicit runs)

Use this when you want full control (e.g., pre-release cleanup, occasional housekeeping, or when you don’t wire auto-retention into st_save()).

Keep the n latest for a specific artifact

# Dry run (safe preview)
# Only prints a summary of what would be removed; A.qs still has 3 versions.
st_prune_versions(path = pA, policy = 2, dry_run = TRUE, alias = NULL)
##  DRY RUN: 1 version would be pruned across 1 artifact.
##   Estimated space reclaimed: ~243 bytes
# Apply pruning
# The returned table has one row per pruned version (here the oldest one).
repA <- st_prune_versions(path = pA, policy = 2, dry_run = FALSE, alias = NULL)
repA
##         artifact_id                                artifact_path
##              <char>                                       <char>
## 1: a55218221e01eab2 /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs
##          version_id                  created_at size_bytes
##              <char>                      <char>      <num>
## 1: 45b8280be2ccc790 2026-05-22T14:26:02.910279Z        243
nrow(st_versions(pA, alias = NULL)) # <= 2; latest always protected
## [1] 2

Practical tip: always run the dry_run = TRUE preview and inspect the plan it returns before calling with dry_run = FALSE. The returned table lists the snapshots that would be removed, so you can store that plan in CI logs for audit.

Keep by recency window across the entire catalog

# Keep anything from the last 14 days; preview first
st_prune_versions(policy = list(days = 14), dry_run = TRUE, alias = NULL)
##  DRY RUN: 1 version would be pruned across 1 artifact.
##   Estimated space reclaimed: ~243 bytes
# Apply: dry_run must be set to FALSE explicitly; without it the call runs as
# a dry run and nothing is deleted.
repAll <- st_prune_versions(
  policy = list(days = 14),
  dry_run = FALSE,
  alias = NULL
)
# Report of the affected versions (one row per version)
head(repAll)
##         artifact_id                                artifact_path
##              <char>                                       <char>
## 1: a55218221e01eab2 /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs
##          version_id                  created_at size_bytes
##              <char>                      <char>      <num>
## 1: c50cc77244e5f241 2026-05-22T14:26:02.968149Z        243

Combine count + recency (union semantics)

# Keep last 2 versions OR any version created within 7 days
# No dry_run argument is passed, so the function's default mode applies; the
# message below shows that no version fell outside the policy at this point.
st_prune_versions(policy = list(n = 2, days = 7), alias = NULL)
##  Retention policy matched zero versions; nothing to prune.

What the report returns

A table with one row per version selected for pruning, with columns artifact_id, artifact_path, version_id, created_at, and size_bytes (see the printed reports above). Use dry_run = TRUE to log/approve a plan before destructive actions (recommended on shared infra/CI).

Edge cases & guarantees

  • Latest version is always protected (even if it falls outside your policy window).
  • Time windows are computed from each version’s created_at in the catalog.
  • Pruning is idempotent: re-running with the same policy won’t remove more once the policy is satisfied.

Automatic pruning (on every st_save())

If you call an internal .st_apply_retention() at the end of st_save(), you can choose a global policy and stop thinking about it:

# Keep only the latest 2 versions per artifact going forward
st_opts(retain_versions = 2)
##  stamp options updated
##   retain_versions = "2"
# New save writes + immediate prune
# A fourth content for A; after the save the version count is checked again.
xA4 <- data.frame(a = 4:6)
st_save(xA4, pA, code = function(z) z, alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/A.qs @ version
##   a850eeb4bf4f7d32
nrow(st_versions(pA, alias = NULL)) # <= 2
## [1] 2

If you enable automatic pruning via st_opts(retain_versions = <policy>), .st_apply_retention() will be invoked after each st_save() and prune according to the policy for the just-written artifact. This keeps your versions directory compact without extra housekeeping steps.

Be cautious when enabling aggressive retention (small n or short days) in shared or collaborative projects: set dry_run = TRUE in CI or have an approval step before destructive pruning.

Disable auto-pruning:

# Restore the keep-everything policy (Inf) for the rest of the session.
st_opts(retain_versions = Inf) # keep all versions
##  stamp options updated
##   retain_versions = "Inf"

Where to set this: for project-wide behavior, set options(stamp.retain_versions = ...) in your project .Rprofile. At runtime, st_opts(retain_versions = ...) takes precedence.


Primary-key metadata & load-time checks

Milestone 4 adds PK metadata so each table carries its identity. This improves join safety, reproducibility, and downstream tooling (e.g., merge validation).

# Example table whose grain is (country, year, reporting_level); `pop` is the
# measure column.
pop <- data.frame(
  country = c("PER", "PER", "COL"),
  year = c(2023, 2024, 2023),
  reporting_level = c("national", "urban", "national"),
  pop = c(34e6, 12e6, 52e6)
)

p_pop <- "inputs/population.qs"

# Validates uniqueness by default and writes PK to sidecar
# Note: `st_save(..., pk = ...)` validates the keys against the provided data
# and persists the `pk` element into the artifact's sidecar (stmeta/).
st_save(pop, p_pop, pk = c("country", "year", "reporting_level"), alias = NULL)
##  Saved [qs2] → /tmp/Rtmp3nPsIC/stamp-retention-example/inputs/population.qs @
##   version deb6b6ea9cbc0bb6

Effects

  • Validates that PK columns exist and are unique (unless you disable uniqueness).
  • Persists pk into the artifact sidecar.
  • Attaches attr(x, "stamp_pk") in memory on subsequent st_load().

Inspect / repair PK later

# NOTE(review): unlike most other calls in this vignette, the two calls below
# do not pass alias = NULL — confirm this is intentional.
st_inspect_pk(p_pop) # read PK from sidecar
## [1] "country"         "year"            "reporting_level"
# If an older artifact lacks PK or you need to repair metadata, use:
# (the messages below show st_add_pk() loading the artifact and reporting
# require_pk_on_load updates before and after recording the key)
st_add_pk(p_pop, keys = c("country", "year", "reporting_level"))
##  stamp options updated
##   require_pk_on_load = "FALSE"
##  Loaded [qs2] ← /tmp/Rtmp3nPsIC/stamp-retention-example/inputs/population.qs
##  Recorded primary key for inputs/population.qs --> country, year,
##   reporting_level
##  stamp options updated
##   require_pk_on_load = "FALSE"

Load-time behavior & options

# Load the artifact back; the PK comes back as a list with a `keys` element in
# the "stamp_pk" attribute.
obj <- st_load(p_pop, alias = NULL)
##  Loaded [qs2] ←
## /tmp/Rtmp3nPsIC/stamp-retention-example/inputs/population.qs
attr(obj, "stamp_pk") # keys attached on load
## $keys
## [1] "country"         "year"            "reporting_level"

Missing PK policy:

# Read the current values of the two missing-PK options (.get = TRUE returns
# the value instead of setting it).
st_opts("require_pk_on_load", .get = TRUE) # default FALSE
## [1] FALSE
st_opts("warn_missing_pk_on_load", .get = TRUE) # default TRUE
## [1] TRUE
# CI: make PK presence a hard requirement
# Note: this changes both options for the remainder of the session.
st_opts(require_pk_on_load = TRUE, warn_missing_pk_on_load = FALSE)
##  stamp options updated
##   require_pk_on_load = "TRUE", warn_missing_pk_on_load = "FALSE"

Why PKs matter (joins & merges)

# Two in-memory tables sharing the (country, year) grain; st_with_pk() tags
# each with its key columns. Note that `pop` is reassigned here and no longer
# refers to the table saved earlier.
pop <- data.frame(
  country = c("PER", "MEX"),
  year = c(2023, 2022),
  pop = c(34.5, 126.7)
)
pop <- st_with_pk(pop, c("country", "year"))

gdp <- data.frame(
  country = c("PER", "MEX"),
  year = c(2023, 2022),
  gdp = c(0.27, 1.26)
)
gdp <- st_with_pk(gdp, c("country", "year"))

# Join on the shared key columns, then re-attach the PK by hand
# (presumably merge() does not carry the "stamp_pk" attribute over — verify).
merged <- merge(pop, gdp, by = c("country", "year"))
attr(merged, "stamp_pk") <- list(keys = c("country", "year")) # preserve grain
merged
##   country year   pop  gdp
## 1     MEX 2022 126.7 1.26
## 2     PER 2023  34.5 0.27

Catalog corruption & safety

On rare occasions a catalog file can become unreadable (disk issues, manual edit, or process crash). The package is conservative:

  • .st_catalog_read() will error if the catalog cannot be parsed. You can recover by removing the corrupted catalog file (it lives under <root>/<state_dir>/catalog.qs2) and re-running st_save(); the catalog will be recreated from the remaining snapshot directories.
  • Always back up the catalog.qs2 (and artifact versions/ directories) before running destructive operations in production.

Example recovery steps (manual):

  1. Move the corrupted catalog: mv <state>/catalog.qs2 <state>/catalog.qs2.bak
  2. Re-run st_save() on a representative artifact to recreate the catalog.

These steps are safe because pruning and catalog operations never touch the live artifact files — only the committed snapshots and the catalog.

Tips

  • Include all identifier columns in the PK (country, year, reporting_level, …). Measures are not part of the PK.
  • Prefer recording PKs at write time. Post-hoc repairs are supported but easier to forget.

Sidecar metadata (quick reference)

Every artifact has a JSON sidecar under sibling stmeta/ with:

  • Core: format, created_at, size_bytes
  • Integrity: content_hash, code_hash, optional file_hash
  • Lineage: parents, code_label
  • Tabular: pk, optional domain

Inspect:

# Read the sidecar for the population artifact and list its fields; the PK is
# stored under `pk` as a list with a `keys` element.
side <- st_read_sidecar(p_pop)
names(side)
##  [1] "path"         "format"       "created_at"   "size_bytes"   "content_hash"
##  [6] "code_hash"    "file_hash"    "code_label"   "parents"      "attrs"       
## [11] "pk"
side$pk
## $keys
## [1] "country"         "year"            "reporting_level"

Integrity checks on load (if enabled via st_opts(verify_on_load = TRUE)):

  • Warn on mismatched content_hash (object changed).
  • Warn on mismatched file_hash (file bytes changed), when recorded.

Partitioned datasets (Hive-style)

When you need one file per key combo (e.g., per country/year), use the partition helpers. Layout:

<base>/<k1>=<v1>/<k2>=<v2>/part.<ext>

Create partitions & save parts

# Base directory of the partitioned dataset.
base <- "inputs/country_year"

# Paths (order of keys doesn't matter; normalized internally)
# NOTE(review): p_per_2023 and p_mex_2022 are not used again in this section;
# st_save_part() below takes `base` + `key` directly.
p_per_2023 <- st_part_path(
  base,
  key = list(country = "PER", year = 2023),
  format = "qs2"
)
p_mex_2022 <- st_part_path(base, key = list(country = "MEX", year = 2022))

# One single-row table per partition; the key values also appear as columns.
per_tbl <- data.frame(country = "PER", year = 2023, pop = 34.5)
mex_tbl <- data.frame(country = "MEX", year = 2022, pop = 126.7)

# Save; PK recorded in each partition's sidecar
st_save_part(
  per_tbl,
  base,
  key = list(country = "PER", year = 2023),
  pk = c("country", "year"),
  alias = NULL
)
##  Saved [qs2] →
##   /tmp/Rtmp3nPsIC/stamp-retention-example/inputs/country_year/country=PER/year=2023/part.qs2
##   @ version 1b81971d08157d56
st_save_part(
  mex_tbl,
  base,
  key = list(country = "MEX", year = 2022),
  pk = c("country", "year"),
  alias = NULL
)
##  Saved [qs2] →
##   /tmp/Rtmp3nPsIC/stamp-retention-example/inputs/country_year/country=MEX/year=2022/part.qs2
##   @ version 22b76a569a2b66d3

st_save_part() uses st_save() under the hood and writes sidecars in a local stmeta/ directory inside each partition.

Discover & load

# NOTE(review): every result rendered below is empty (0 rows) even though two
# partitions were saved just above. Presumably the relative `base` is not
# being resolved against the stamp root in these calls (none of them passes
# alias = NULL) — verify, and re-render once the listing finds the parts.
# List artifacts (sidecars under stmeta/ are ignored)
st_list_parts(base)
## [1] path
## <0 rows> (or 0-length row.names)
st_list_parts(base, filter = list(country = "PER"))
## [1] path
## <0 rows> (or 0-length row.names)
# Bind rows (adds key columns as ordinary columns)
all_parts <- st_load_parts(base, as = "rbind")
all_parts
## data frame with 0 columns and 0 rows
# data.table option (if installed)
if (requireNamespace("data.table", quietly = TRUE)) {
  dt <- st_load_parts(base, as = "dt")
  dt[]
}
## Null data.table (0 rows and 0 cols)

Notes

  • If a partition artifact is not a data.frame, st_load_parts() returns a one-row table with a .object list-column and still appends the key columns.
  • Folder keys (e.g., country=PER/year=2023) should agree with the partition table’s PK columns.

Recommendations & Recipes

  • Safety first: always dry_run = TRUE on large catalogs; store the plan for auditability.
  • Auto vs. manual: prefer auto retention for day-to-day saves; add a manual prune step in release or housekeeping jobs.
  • Project defaults: set options(stamp.retain_versions = ...) in .Rprofile; override at runtime with st_opts().
  • CI: enforce require_pk_on_load = TRUE and run st_prune_versions(..., dry_run = TRUE) to produce a log before destructive steps.
  • Time windows: retention windows are based on created_at recorded at version creation; keep machine clocks sane on shared hosts.

FAQ

Does pruning ever delete the latest version? No. The latest version per artifact is always protected.

What are the union semantics of list(n, days)? A version is kept if it is among the n most recent or if its created_at falls within the days window.

Where are PKs stored? In each artifact’s sidecar JSON under sibling stmeta/. st_load() re-attaches them as attr(x, "stamp_pk").

Do partition helpers change how retention works? No. Each partition artifact is versioned/pruned independently, inheriting the same retention policies.