Data Loading API¶

Module for loading CellRanger VDJ and GEX outputs.

loader ¶

Data loading functions for TCRsift.

Handles loading CellRanger VDJ and GEX outputs into unified data structures.

load_cellranger_vdj ¶

load_cellranger_vdj(vdj_dir: str | Path, sample_name: str, annotations_filename: str = 'filtered_contig_annotations.csv', clonotypes_filename: str = 'clonotypes.csv', verbose: bool = True) -> pd.DataFrame

Load CellRanger VDJ output files.

Parameters:

Name	Type	Description	Default
`vdj_dir`	`str or Path`	Path to CellRanger VDJ output directory	required
`sample_name`	`str`	Name to assign to this sample	required
`annotations_filename`	`str`	Name of the contig annotations CSV file	`'filtered_contig_annotations.csv'`
`clonotypes_filename`	`str`	Name of the clonotypes CSV file	`'clonotypes.csv'`
`verbose`	`bool`	Print progress information	`True`

Returns:

Type	Description
`DataFrame`	DataFrame with VDJ annotations for all cells

Source code in tcrsift/loader.py

def load_cellranger_vdj(
    vdj_dir: str | Path,
    sample_name: str,
    annotations_filename: str = "filtered_contig_annotations.csv",
    clonotypes_filename: str = "clonotypes.csv",
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Load CellRanger VDJ output files.

    Reads the contig annotations CSV (falling back to
    ``all_contig_annotations.csv`` when the requested file is absent), tags
    every row with the sample name and source directory, merges MAIT/iNKT
    evidence from the clonotypes CSV when that file and the join columns are
    available, and joins the VDJ segment columns into ``vdj_aa`` / ``vdj_nt``
    when every segment column is present.

    Parameters
    ----------
    vdj_dir : str or Path
        Path to CellRanger VDJ output directory
    sample_name : str
        Name to assign to this sample
    annotations_filename : str
        Name of the contig annotations CSV file
    clonotypes_filename : str
        Name of the clonotypes CSV file
    verbose : bool
        Print progress information

    Returns
    -------
    pd.DataFrame
        DataFrame with VDJ annotations for all cells

    Raises
    ------
    TCRsiftValidationError
        If the directory fails validation, no annotations file can be found,
        the annotations file is empty, or the required ``barcode`` / ``chain``
        columns are missing.
    """
    vdj_dir = Path(vdj_dir)

    # Validate directory exists and has expected files
    try:
        vdj_dir = validate_cellranger_vdj_dir(vdj_dir)
    except TCRsiftValidationError as err:
        # Re-raise with more context, keeping the original error as the cause
        raise TCRsiftValidationError(
            f"Invalid CellRanger VDJ directory for sample '{sample_name}': {vdj_dir}",
            hint="Make sure this is the 'outs' directory from 'cellranger vdj'. "
            "It should contain 'filtered_contig_annotations.csv' or 'all_contig_annotations.csv'.",
        ) from err

    annotations_path = vdj_dir / annotations_filename
    clonotypes_path = vdj_dir / clonotypes_filename

    if not annotations_path.exists():
        # Try alternative filename
        alt_path = vdj_dir / "all_contig_annotations.csv"
        if alt_path.exists():
            annotations_path = alt_path
            if verbose:
                logger.info("  Using all_contig_annotations.csv (filtered not found)")
        else:
            raise TCRsiftValidationError(
                f"VDJ annotations file not found: {annotations_path}",
                hint=f"Expected one of: {annotations_filename}, all_contig_annotations.csv "
                f"in directory: {vdj_dir}",
            )

    logger.info(f"Loading VDJ annotations from {annotations_path}")
    try:
        df = pd.read_csv(annotations_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header for pandas to parse. Treat it like a
        # header-only file so both cases hit the same validation error below.
        df = pd.DataFrame()

    # Validate the VDJ data
    if len(df) == 0:
        raise TCRsiftValidationError(
            f"VDJ annotations file is empty: {annotations_path}",
            hint="Check that CellRanger VDJ ran successfully. "
            "The file should contain contig annotations.",
        )

    required_cols = ["barcode", "chain"]
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        raise TCRsiftValidationError(
            f"VDJ annotations missing required columns: {missing_cols}",
            hint=f"Available columns: {list(df.columns)[:15]}. "
            "This doesn't look like a CellRanger VDJ output file.",
        )

    # Log summary statistics
    n_contigs = len(df)
    n_cells = df["barcode"].nunique()
    if "productive" in df.columns:
        productive = df["productive"]
        if productive.dtype == bool:
            n_productive = int(productive.sum())
        else:
            # Non-bool column (e.g. mixed with "None"): compare as text,
            # case-insensitively so both "True" and "true" are counted.
            n_productive = int(productive.astype(str).str.lower().eq("true").sum())
    else:
        n_productive = "unknown"
    if verbose:
        logger.info(
            f"  Loaded {n_contigs:,} contigs from {n_cells:,} cells ({n_productive} productive)"
        )

    # Validate chain types (unexpected values only produce a warning)
    valid_chains = {"TRA", "TRB", "TRD", "TRG", "IGH", "IGK", "IGL", "Multi"}
    invalid_chains = set(df["chain"].unique()) - valid_chains
    if invalid_chains:
        logger.warning(f"  Unexpected chain types found: {invalid_chains}")

    # Add sample information
    df["sample"] = sample_name
    df["vdj_dir"] = str(vdj_dir)

    # Load clonotypes if available
    if clonotypes_path.exists():
        logger.info(f"Loading clonotypes from {clonotypes_path}")
        df_clonotypes = pd.read_csv(clonotypes_path)
        # Clonotypes contain MAIT/NKT evidence
        evidence_cols = [
            col for col in ("mait_evidence", "inkt_evidence") if col in df_clonotypes.columns
        ]
        if evidence_cols:
            # Only "barcode" and "chain" are validated above, so the join keys
            # may be absent on either side; skip the merge rather than fail
            # with a bare KeyError.
            if "clonotype_id" not in df_clonotypes.columns:
                logger.warning(
                    "  Clonotypes file has no 'clonotype_id' column; "
                    "skipping MAIT/iNKT evidence merge"
                )
            elif "raw_clonotype_id" not in df.columns:
                logger.warning(
                    "  VDJ annotations have no 'raw_clonotype_id' column; "
                    "skipping MAIT/iNKT evidence merge"
                )
            else:
                df_clonotypes_subset = df_clonotypes[["clonotype_id", *evidence_cols]].copy()
                df_clonotypes_subset = df_clonotypes_subset.rename(
                    columns={"clonotype_id": "raw_clonotype_id"}
                )
                # Drop duplicate clonotype ids before the left merge: a malformed /
                # concatenated clonotypes.csv with a repeated id would otherwise
                # fan-out (duplicate) every contig row for that clonotype.
                df_clonotypes_subset = df_clonotypes_subset.drop_duplicates(
                    "raw_clonotype_id"
                )
                df = df.merge(df_clonotypes_subset, on="raw_clonotype_id", how="left")

    # Combine VDJ segments into full sequence if available
    if all(col in df.columns for col in VDJ_SEGMENT_COLS):
        df["vdj_aa"] = df[VDJ_SEGMENT_COLS].fillna("").agg("".join, axis=1)
    if all(col in df.columns for col in VDJ_SEGMENT_NT_COLS):
        df["vdj_nt"] = df[VDJ_SEGMENT_NT_COLS].fillna("").agg("".join, axis=1)

    return df

load_cellranger_gex ¶

load_cellranger_gex(gex_dir: str | Path, sample_name: str, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0, verbose: bool = True) -> ad.AnnData

Load CellRanger gene expression output.

Parameters:

Name	Type	Description	Default
`gex_dir`	`str or Path`	Path to CellRanger count output directory	required
`sample_name`	`str`	Name to assign to this sample	required
`min_genes`	`int`	Minimum genes detected per cell	`250`
`max_genes`	`int`	Maximum genes detected per cell	`15000`
`min_counts`	`int`	Minimum UMI counts per cell	`500`
`max_counts`	`int`	Maximum UMI counts per cell	`100000`
`max_mito_pct`	`float`	Maximum mitochondrial percentage	`8.0`
`min_mito_pct`	`float`	Minimum mitochondrial percentage (a FLOOR: cells below this are dropped, targeting near-zero-mito empty/ambient droplets). Default 2.0; the drop is reported at info level (when `verbose`) and escalated to a warning when more than 10% of cells fall below the floor, so it isn't silent (#168). Set 0 to disable the floor.	`2.0`
`verbose`	`bool`	Print progress information	`True`

Returns:

Type	Description
`AnnData`	AnnData object with gene expression data

Source code in tcrsift/loader.py

def load_cellranger_gex(
    gex_dir: str | Path,
    sample_name: str,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
    verbose: bool = True,
) -> ad.AnnData:
    """
    Load CellRanger gene expression output.

    Reads the filtered feature-barcode matrix (matrix directory first, then
    the ``.h5`` file, each also looked up under ``outs/``), computes QC
    metrics, records one boolean ``filter:*`` column per threshold, and
    returns only the cells that pass every filter.

    Parameters
    ----------
    gex_dir : str or Path
        Path to CellRanger count output directory
    sample_name : str
        Name to assign to this sample
    min_genes : int
        Minimum genes detected per cell
    max_genes : int
        Maximum genes detected per cell
    min_counts : int
        Minimum UMI counts per cell
    max_counts : int
        Maximum UMI counts per cell
    max_mito_pct : float
        Maximum mitochondrial percentage
    min_mito_pct : float
        Minimum mitochondrial percentage (a FLOOR: cells below this are
        dropped, targeting near-zero-mito empty/ambient droplets). Default 2.0;
        the drop is reported at info level (when ``verbose``) and escalated to
        a warning when more than 10% of cells fall below the floor, so it
        isn't silent (#168). Set 0 to disable the floor.
    verbose : bool
        Print progress information

    Returns
    -------
    ad.AnnData
        AnnData object with gene expression data, restricted to cells that
        pass QC

    Raises
    ------
    TCRsiftValidationError
        If a min/max pair is inverted, the directory fails validation, no
        expression matrix is found, or the loaded matrix has no cells.
    """
    # Validate numeric parameters
    validate_numeric_param(min_genes, "min_genes", min_value=0)
    validate_numeric_param(max_genes, "max_genes", min_value=1)
    validate_numeric_param(min_counts, "min_counts", min_value=0)
    validate_numeric_param(max_counts, "max_counts", min_value=1)
    validate_numeric_param(min_mito_pct, "min_mito_pct", min_value=0, max_value=100)
    validate_numeric_param(max_mito_pct, "max_mito_pct", min_value=0, max_value=100)

    if min_genes > max_genes:
        raise TCRsiftValidationError(
            f"min_genes ({min_genes}) cannot be greater than max_genes ({max_genes})",
            hint="Check your QC filter parameters.",
        )
    if min_counts > max_counts:
        raise TCRsiftValidationError(
            f"min_counts ({min_counts}) cannot be greater than max_counts ({max_counts})",
            hint="Check your QC filter parameters.",
        )
    if min_mito_pct > max_mito_pct:
        raise TCRsiftValidationError(
            f"min_mito_pct ({min_mito_pct}) cannot be greater than max_mito_pct ({max_mito_pct})",
            hint="Check your QC filter parameters.",
        )

    gex_dir = Path(gex_dir)

    # Validate directory
    try:
        gex_dir = validate_cellranger_gex_dir(gex_dir)
    except TCRsiftValidationError as err:
        # Re-raise with sample context, keeping the original error as the cause
        raise TCRsiftValidationError(
            f"Invalid CellRanger GEX directory for sample '{sample_name}': {gex_dir}",
            hint="Make sure this is the 'outs' directory from 'cellranger count'. "
            "It should contain 'filtered_feature_bc_matrix' or 'filtered_feature_bc_matrix.h5'.",
        ) from err

    # Try standard CellRanger output locations
    matrix_dir = gex_dir / "filtered_feature_bc_matrix"
    if not matrix_dir.exists():
        matrix_dir = gex_dir / "outs" / "filtered_feature_bc_matrix"
    if not matrix_dir.exists():
        # Try h5 file
        h5_path = gex_dir / "filtered_feature_bc_matrix.h5"
        if not h5_path.exists():
            h5_path = gex_dir / "outs" / "filtered_feature_bc_matrix.h5"
        if h5_path.exists():
            logger.info(f"Loading GEX from h5 file: {h5_path}")
            adata = sc.read_10x_h5(str(h5_path))
        else:
            # Sorted so the hint is stable across filesystems
            available = sorted(f.name for f in gex_dir.iterdir())[:15]
            raise TCRsiftValidationError(
                f"Gene expression matrix not found in: {gex_dir}",
                hint=f"Expected 'filtered_feature_bc_matrix' directory or 'filtered_feature_bc_matrix.h5'. "
                f"Available files/directories: {available}",
            )
    else:
        logger.info(f"Loading GEX from matrix directory: {matrix_dir}")
        adata = sc.read_10x_mtx(str(matrix_dir), var_names="gene_ids")

    # Validate loaded data
    if adata.n_obs == 0:
        raise TCRsiftValidationError(
            f"Gene expression matrix contains no cells: {gex_dir}",
            hint="Check that CellRanger count ran successfully.",
        )

    if verbose:
        logger.info(f"  Loaded {adata.n_obs:,} cells x {adata.n_vars:,} genes")

    # Add sample information
    adata.obs["sample"] = sample_name
    adata.obs["gex_dir"] = str(gex_dir)

    # Calculate QC metrics - detect mitochondrial genes
    # Handle both loading paths:
    # - h5 path: var_names are gene symbols (MT-ND1, etc.)
    # - mtx path with var_names="gene_ids": var_names are ENSEMBL IDs, symbols in var['gene_symbols']
    # na=False keeps the mask strictly boolean even if a symbol is missing;
    # without it a NaN symbol would turn "mt" into an object column.
    if "gene_symbols" in adata.var.columns:
        adata.var["mt"] = adata.var["gene_symbols"].str.startswith("MT-", na=False)
    else:
        adata.var["mt"] = adata.var_names.str.startswith("MT-", na=False)
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

    # Rename QC columns for consistency
    if "pct_counts_mt" in adata.obs.columns:
        adata.obs["percent_mt"] = adata.obs["pct_counts_mt"]
    if "n_genes_by_counts" in adata.obs.columns:
        adata.obs["n_genes"] = adata.obs["n_genes_by_counts"]
    if "total_counts" in adata.obs.columns:
        adata.obs["n_counts"] = adata.obs["total_counts"]

    # Add QC filter flags
    adata.obs["filter:min_genes"] = adata.obs["n_genes"] >= min_genes
    adata.obs["filter:max_genes"] = adata.obs["n_genes"] <= max_genes
    adata.obs["filter:min_counts"] = adata.obs["n_counts"] >= min_counts
    adata.obs["filter:max_counts"] = adata.obs["n_counts"] <= max_counts
    adata.obs["filter:min_mito"] = adata.obs["percent_mt"] >= min_mito_pct
    adata.obs["filter:max_mito"] = adata.obs["percent_mt"] <= max_mito_pct

    # The min_mito_pct FLOOR discards *low*-mito cells (near-zero-mito
    # empty/ambient droplets). The default 2.0 floor trims a small tail on most
    # datasets — that's the intended behavior, so report it at info level rather
    # than warning on every run (#168). Only escalate to a warning when the floor
    # culls an unusually large fraction (>10%), which signals it's set too high.
    if min_mito_pct > 0:
        n_below_floor = int((~adata.obs["filter:min_mito"]).sum())
        if n_below_floor:
            frac = n_below_floor / adata.n_obs if adata.n_obs else 0.0
            msg = (
                f"  min_mito_pct={min_mito_pct} FLOOR dropped "
                f"{n_below_floor:,}/{adata.n_obs:,} ({frac * 100:.0f}%) low-mito cells"
            )
            if frac > 0.10:
                logger.warning(
                    msg + " — a >10% cull from the mito floor is unusual; "
                    "set min_mito_pct=0 to disable it."
                )
            elif verbose:
                logger.info(msg)

    adata.obs["filter:pass_qc"] = (
        adata.obs["filter:min_genes"]
        & adata.obs["filter:max_genes"]
        & adata.obs["filter:min_counts"]
        & adata.obs["filter:max_counts"]
        & adata.obs["filter:min_mito"]
        & adata.obs["filter:max_mito"]
    )

    # Apply the QC mask. Prior to issue #39 these flags were advisory only —
    # the column was computed but cells failing QC were not dropped, so the
    # user-facing --min-mito/--max-mito/--min-genes/etc. parameters were
    # silently no-ops.
    n_before = adata.n_obs
    adata = adata[adata.obs["filter:pass_qc"]].copy()
    if verbose:
        logger.info(
            f"  QC: {n_before:,} -> {adata.n_obs:,} cells pass "
            f"(dropped {n_before - adata.n_obs:,})"
        )
    # Catch "silently wrong" config mistakes (e.g. --min-mito 1.0 typo for
    # 10) where the filter eats most of the sample and downstream artifacts
    # just look weird rather than failing clearly (#41). 50% is a reasonable
    # line — peripheral blood routinely passes 90%+, tumor sits around 70%,
    # below half is almost always a config or dataset-mismatch issue.
    pass_rate = adata.n_obs / n_before if n_before else 0.0
    if pass_rate < 0.5:
        logger.warning(
            f"  QC dropped {(1 - pass_rate) * 100:.0f}% of cells "
            f"({n_before - adata.n_obs:,}/{n_before:,}) — "
            "check --min-genes/--max-genes/--min-counts/--max-counts/"
            "--min-mito/--max-mito if this is unexpected."
        )

    return adata

load_sample ¶

load_sample(sample: Sample, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0) -> ad.AnnData | None

Load all data for a single sample.

Parameters:

Name	Type	Description	Default
`sample`	`Sample`	Sample object with paths and metadata	required
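`min_genes`	`int`	QC filter parameter for GEX data: minimum genes detected per cell	`250`
`max_genes`	`int`	QC filter parameter for GEX data: maximum genes detected per cell	`15000`
`min_counts`	`int`	QC filter parameter for GEX data: minimum UMI counts per cell	`500`
`max_counts`	`int`	QC filter parameter for GEX data: maximum UMI counts per cell	`100000`
`max_mito_pct`	`float`	QC filter parameter for GEX data: maximum mitochondrial percentage	`8.0`
`min_mito_pct`	`float`	QC filter parameter for GEX data: minimum mitochondrial percentage (floor)	`2.0`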

Returns:

Type	Description
`AnnData or None`	Combined AnnData with GEX and VDJ data, or None if sample has neither gex_dir nor vdj_dir.

Source code in tcrsift/loader.py

def load_sample(
    sample: Sample,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
) -> ad.AnnData | None:
    """
    Load every available modality for one sample and attach its metadata.

    GEX is loaded first (when ``sample.gex_dir`` is set), then VDJ (when
    ``sample.vdj_dir`` is set). With both present they are combined; with VDJ
    only, an AnnData is built from the per-barcode pivot of the contigs.

    Parameters
    ----------
    sample : Sample
        Sample object with paths and metadata
    min_genes, max_genes, min_counts, max_counts, max_mito_pct, min_mito_pct
        QC filter parameters, forwarded to the GEX loader

    Returns
    -------
    ad.AnnData or None
        Combined AnnData with GEX and VDJ data, or None if sample has neither
        gex_dir nor vdj_dir.
    """
    gex_adata = None
    if sample.gex_dir:
        gex_adata = load_cellranger_gex(
            sample.gex_dir,
            sample.sample,
            min_genes=min_genes,
            max_genes=max_genes,
            min_counts=min_counts,
            max_counts=max_counts,
            max_mito_pct=max_mito_pct,
            min_mito_pct=min_mito_pct,
        )

    contigs = None
    if sample.vdj_dir:
        contigs = load_cellranger_vdj(sample.vdj_dir, sample.sample)

    if contigs is None:
        # GEX-only, or nothing at all
        result = gex_adata
    elif gex_adata is None:
        # VDJ-only: obs-only AnnData from the per-barcode pivot
        result = ad.AnnData(obs=_pivot_vdj_by_barcode(contigs))
        result.obs["sample"] = sample.sample
    else:
        result = combine_gex_and_vdj(gex_adata, contigs)

    if result is None:
        return None

    # Fields the user left unset are omitted entirely: an all-None object
    # column survives concat but breaks anndata.write_h5ad, and downstream
    # readers already guard with `if col in adata.obs.columns`.
    candidate_columns = {
        "antigen_type": sample.antigen_type,
        "antigen_description": sample.antigen_description,
        "antigen_name": sample.antigen_name,
        "antigen_sequence": sample.antigen_sequence,
        "epitope_sequence": sample.epitope_sequence,
        "mhc_allele": sample.mhc_allele,
        "antigen_names": sample.antigen_names,
        "antigen_sequences": sample.antigen_sequences,
        "epitope_sequences": sample.epitope_sequences,
        "source": sample.source,
        "patient_id": sample.patient_id,
        "enrichment_method": sample.enrichment_method,
        "timepoint": sample.timepoint,
        "apc_type": sample.apc_type,
        "tissue": sample.tissue,
        "expected_tcell_type": sample.get_expected_tcell_type(),
    }
    supplied = {
        name: value for name, value in candidate_columns.items() if value is not None
    }
    _add_obs_columns(result, supplied)

    return result

load_samples ¶

load_samples(sample_sheet_path: str | Path | SampleSheet, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0, verbose: bool = True, show_progress: bool = True, tmpdir: str | Path | None = None) -> ad.AnnData

Load all samples from a sample sheet into a single AnnData object.

Parameters:

Name	Type	Description	Default
`sample_sheet_path`	`str or Path or SampleSheet`	Path to sample sheet (CSV or YAML), or a SampleSheet instance.	required
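`min_genes`	`int`	QC filter parameter: minimum genes detected per cell.	`250`
`max_genes`	`int`	QC filter parameter: maximum genes detected per cell.	`15000`
`min_counts`	`int`	QC filter parameter: minimum UMI counts per cell.	`500`
`max_counts`	`int`	QC filter parameter: maximum UMI counts per cell.	`100000`
`max_mito_pct`	`float`	QC filter parameter: maximum mitochondrial percentage.	`8.0`
`min_mito_pct`	`float`	QC filter parameter: minimum mitochondrial percentage (floor).	`2.0`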
`verbose`	`bool`	Print detailed progress information.	`True`
`show_progress`	`bool`	Show progress bar.	`True`
`tmpdir`	`str or Path or None`	Parent directory for the spill tempdir (see Notes). Defaults to the system temp location ($TMPDIR / /tmp). Pass an explicit disk-backed directory when the system temp is on tmpfs.	`None`

Returns:

Type	Description
`AnnData`	Combined AnnData with all samples. `X` is None when every input was VDJ-only, otherwise it carries the outer-joined expression matrix.

Notes

Memory: each per-sample AnnData is spilled to a tempfile h5ad after load and the in-memory copy is freed, then merged with anndata.experimental.concat_on_disk, which streams inputs and output in bounded chunks (~400 MB default). The merged file is read back into memory once. Peak memory ≈ max(one sample, output), versus the prior in-memory ad.concat peak of ~2 × Σ(samples).

Temp disk: spilled per-sample h5ads plus the merged output, so ~2× the total sparse dataset size. Redirect with tmpdir= when $TMPDIR is on tmpfs.

VDJ-only samples: see _ensure_x — a zero-column placeholder is synthesized to keep the merge path uniform, then stripped after the merge so combined.X is None for VDJ-only loads.

Source code in tcrsift/loader.py

def load_samples(
    sample_sheet_path: str | Path | SampleSheet,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
    verbose: bool = True,
    show_progress: bool = True,
    tmpdir: str | Path | None = None,
) -> ad.AnnData:
    """
    Load all samples from a sample sheet into a single AnnData object.

    Parameters
    ----------
    sample_sheet_path : str or Path or SampleSheet
        Path to sample sheet (CSV or YAML), or a SampleSheet instance.
    min_genes, max_genes, min_counts, max_counts, max_mito_pct, min_mito_pct
        QC filter parameters.
    verbose : bool
        Print detailed progress information.
    show_progress : bool
        Show progress bar.
    tmpdir : str or Path or None
        Parent directory for the spill tempdir (see Notes). Defaults to the
        system temp location ($TMPDIR / /tmp). Pass an explicit disk-backed
        directory when the system temp is on tmpfs.

    Returns
    -------
    ad.AnnData
        Combined AnnData with all samples. `X` is None when every input was
        VDJ-only, otherwise it carries the outer-joined expression matrix.

    Raises
    ------
    TCRsiftValidationError
        If the sample sheet is empty, a listed VDJ/GEX directory does not
        exist, a sample fails to load, or no sample contributes any cells.

    Notes
    -----
    Memory: each per-sample AnnData is spilled to a tempfile h5ad after
    load and the in-memory copy is freed, then merged with
    `anndata.experimental.concat_on_disk`, which streams inputs and output
    in bounded chunks (~400 MB default). The merged file is read back into
    memory once. Peak memory ≈ max(one sample, output), versus the prior
    in-memory `ad.concat` peak of ~2 × Σ(samples).

    Temp disk: spilled per-sample h5ads plus the merged output, so ~2× the
    total sparse dataset size. Redirect with `tmpdir=` when $TMPDIR is on
    tmpfs.

    VDJ-only samples: see `_ensure_x` — a zero-column placeholder is
    synthesized to keep the merge path uniform, then stripped after the
    merge so `combined.X` is None for VDJ-only loads.

    Empty samples: a sample left with zero cells (e.g. every cell failed QC)
    is skipped with a warning rather than merged.
    """
    # Load sample sheet (path or object)
    if isinstance(sample_sheet_path, SampleSheet):
        sample_sheet = sample_sheet_path
        sample_sheet_label = "<SampleSheet>"
    else:
        sample_sheet_path = validate_file_exists(sample_sheet_path, "sample sheet")
        sample_sheet = load_sample_sheet(sample_sheet_path)
        sample_sheet_label = str(sample_sheet_path)

    if len(sample_sheet) == 0:
        raise TCRsiftValidationError(
            f"Sample sheet is empty: {sample_sheet_label}",
            hint="Add sample entries to the sample sheet.",
        )

    logger.info(f"Loading {len(sample_sheet)} samples from {sample_sheet_label}")

    # Pre-validate all sample paths to fail fast
    validation_errors = []
    for sample in sample_sheet:
        if sample.vdj_dir and not Path(sample.vdj_dir).exists():
            validation_errors.append(
                f"Sample '{sample.sample}': VDJ directory not found: {sample.vdj_dir}"
            )
        if sample.gex_dir and not Path(sample.gex_dir).exists():
            validation_errors.append(
                f"Sample '{sample.sample}': GEX directory not found: {sample.gex_dir}"
            )

    if validation_errors:
        # Show at most five problems, but say so when the list was truncated.
        shown = validation_errors[:5]
        details = "\n".join(f"  - {e}" for e in shown)
        n_hidden = len(validation_errors) - len(shown)
        if n_hidden:
            details += f"\n  ... and {n_hidden} more"
        raise TCRsiftValidationError(
            f"Sample sheet validation failed with {len(validation_errors)} error(s):\n"
            + details,
            hint="Check that all paths in the sample sheet are correct and accessible.",
        )

    sample_keys: list[str] = []
    spill_paths: list[Path] = []
    total_cells = 0

    with tempfile.TemporaryDirectory(prefix="tcrsift_load_", dir=tmpdir) as spill_dir_str:
        spill_dir = Path(spill_dir_str)
        sample_iter = tqdm(
            sample_sheet,
            desc="Loading samples",
            unit="sample",
            disable=not show_progress,
        )

        for sample in sample_iter:
            sample_iter.set_postfix(sample=sample.sample[:20])

            if verbose:
                logger.info(f"Loading sample: {sample.sample}")

            try:
                adata = load_sample(
                    sample,
                    min_genes=min_genes,
                    max_genes=max_genes,
                    min_counts=min_counts,
                    max_counts=max_counts,
                    max_mito_pct=max_mito_pct,
                    min_mito_pct=min_mito_pct,
                )
            except TCRsiftValidationError:
                raise
            except Exception as e:
                raise TCRsiftValidationError(
                    f"Failed to load sample '{sample.sample}': {e}",
                    hint="Check that the CellRanger output directories are valid and complete. "
                    f"VDJ: {sample.vdj_dir}, GEX: {sample.gex_dir}",
                ) from e

            if adata is None:
                continue

            if adata.n_obs == 0:
                # QC can legitimately leave a sample with no cells. Reading
                # `.iloc[0]` below would then fail with a bare IndexError, so
                # skip the sample and make the omission visible.
                logger.warning(
                    f"  Sample {sample.sample}: no cells remaining after loading/QC; skipping"
                )
                continue

            _ensure_x(adata)
            sample_keys.append(adata.obs["sample"].iloc[0])
            total_cells += adata.n_obs

            # Number files by the load order of successful samples (skipped
            # samples don't leave gaps), so the spill dir is easy to inspect.
            path = spill_dir / f"sample_{len(spill_paths):04d}.h5ad"
            adata.write_h5ad(path)
            spill_paths.append(path)

            if verbose:
                logger.info(f"  Sample {sample.sample}: {adata.n_obs:,} cells")
            del adata  # release the in-memory copy now that it's on disk

        if not sample_keys:
            raise TCRsiftValidationError(
                "No samples loaded successfully",
                hint="Check that at least one sample has valid VDJ or GEX data.",
            )

        logger.info(f"Concatenating {len(sample_keys)} samples ({total_cells:,} total cells)")
        out_path = spill_dir / "combined.h5ad"
        with tqdm(
            total=1, desc="Concatenating samples", unit="step", disable=not show_progress
        ) as pbar:
            concat_on_disk(
                spill_paths, out_path,
                join="outer", label="sample", keys=sample_keys,
            )
            # Non-unique obs_names are EXPECTED and intentional here: 10x reuses
            # one barcode whitelist across samples, and we deliberately do not
            # uniquify (downstream code indexes positionally with a sample-column
            # mask — see qc.py `gex_vdj_overlap`). So anndata's "Observation names
            # are not unique" notice on read-back is noise, not a problem to fix.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", message="Observation names are not unique",
                    category=UserWarning,
                )
                combined = ad.read_h5ad(out_path)
            pbar.update(1)

    # Tempdir is cleaned up here; `combined` is fully in-memory.
    # Inverse of `_ensure_x`: an all-VDJ-only sheet produces n_vars == 0,
    # in which case the synthesized empty placeholders should look like the
    # original no-X state to downstream callers.
    if combined.n_vars == 0:
        combined.X = None

    # Store sample sheet as uns as a JSON string. The previous form,
    # `to_dataframe().to_dict()`, produced `{col: {row_idx: val}}` whose
    # int row-index keys break h5ad serialization. Other structured forms
    # (DataFrame directly, `to_dict(orient="list")`) hit anndata/h5py
    # mixed-None-object-column limitations. JSON is unambiguous about
    # None and stores as a single string. Read back with
    # `pd.read_json(io.StringIO(adata.uns["sample_sheet"]), orient="records")`.
    combined.uns["sample_sheet"] = sample_sheet.to_dataframe().to_json(
        orient="records"
    )

    logger.info(f"Successfully loaded {combined.n_obs:,} cells from {len(sample_keys)} samples")

    return combined