Skip to content

Data Loading API

Module for loading CellRanger VDJ and GEX outputs.

loader

Data loading functions for TCRsift.

Handles loading CellRanger VDJ and GEX outputs into unified data structures.

load_cellranger_vdj

load_cellranger_vdj(vdj_dir: str | Path, sample_name: str, annotations_filename: str = 'filtered_contig_annotations.csv', clonotypes_filename: str = 'clonotypes.csv', verbose: bool = True) -> pd.DataFrame

Load CellRanger VDJ output files.

Parameters:

Name Type Description Default
vdj_dir str or Path

Path to CellRanger VDJ output directory

required
sample_name str

Name to assign to this sample

required
annotations_filename str

Name of the contig annotations CSV file

'filtered_contig_annotations.csv'
clonotypes_filename str

Name of the clonotypes CSV file

'clonotypes.csv'
verbose bool

Print progress information

True

Returns:

Type Description
DataFrame

DataFrame with VDJ annotations for all cells

Source code in tcrsift/loader.py
def load_cellranger_vdj(
    vdj_dir: str | Path,
    sample_name: str,
    annotations_filename: str = "filtered_contig_annotations.csv",
    clonotypes_filename: str = "clonotypes.csv",
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Load CellRanger VDJ output files.

    Parameters
    ----------
    vdj_dir : str or Path
        Path to CellRanger VDJ output directory
    sample_name : str
        Name to assign to this sample
    annotations_filename : str
        Name of the contig annotations CSV file
    clonotypes_filename : str
        Name of the clonotypes CSV file
    verbose : bool
        Print progress information

    Returns
    -------
    pd.DataFrame
        DataFrame with VDJ annotations for all cells
    """
    vdj_dir = Path(vdj_dir)

    # Validate directory exists and has expected files
    try:
        vdj_dir = validate_cellranger_vdj_dir(vdj_dir)
    except TCRsiftValidationError:
        # Re-raise with more context
        raise TCRsiftValidationError(
            f"Invalid CellRanger VDJ directory for sample '{sample_name}': {vdj_dir}",
            hint="Make sure this is the 'outs' directory from 'cellranger vdj'. "
            "It should contain 'filtered_contig_annotations.csv' or 'all_contig_annotations.csv'.",
        )

    annotations_path = vdj_dir / annotations_filename
    clonotypes_path = vdj_dir / clonotypes_filename

    if not annotations_path.exists():
        # Try alternative filename
        alt_path = vdj_dir / "all_contig_annotations.csv"
        if alt_path.exists():
            annotations_path = alt_path
            if verbose:
                logger.info("  Using all_contig_annotations.csv (filtered not found)")
        else:
            raise TCRsiftValidationError(
                f"VDJ annotations file not found: {annotations_path}",
                hint=f"Expected one of: {annotations_filename}, all_contig_annotations.csv "
                f"in directory: {vdj_dir}",
            )

    logger.info(f"Loading VDJ annotations from {annotations_path}")
    df = pd.read_csv(annotations_path)

    # Validate the VDJ data
    if len(df) == 0:
        raise TCRsiftValidationError(
            f"VDJ annotations file is empty: {annotations_path}",
            hint="Check that CellRanger VDJ ran successfully. "
            "The file should contain contig annotations.",
        )

    required_cols = ["barcode", "chain"]
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        raise TCRsiftValidationError(
            f"VDJ annotations missing required columns: {missing_cols}",
            hint=f"Available columns: {list(df.columns)[:15]}. "
            "This doesn't look like a CellRanger VDJ output file.",
        )

    # Log summary statistics
    n_contigs = len(df)
    n_cells = df["barcode"].nunique()
    n_productive = df["productive"].sum() if "productive" in df.columns else "unknown"
    if verbose:
        logger.info(
            f"  Loaded {n_contigs:,} contigs from {n_cells:,} cells ({n_productive} productive)"
        )

    # Validate chain types
    valid_chains = {"TRA", "TRB", "TRD", "TRG", "IGH", "IGK", "IGL", "Multi"}
    invalid_chains = set(df["chain"].unique()) - valid_chains
    if invalid_chains:
        logger.warning(f"  Unexpected chain types found: {invalid_chains}")

    # Add sample information
    df["sample"] = sample_name
    df["vdj_dir"] = str(vdj_dir)

    # Load clonotypes if available
    if clonotypes_path.exists():
        logger.info(f"Loading clonotypes from {clonotypes_path}")
        df_clonotypes = pd.read_csv(clonotypes_path)
        # Clonotypes contain MAIT/NKT evidence
        if "mait_evidence" in df_clonotypes.columns or "inkt_evidence" in df_clonotypes.columns:
            # Merge clonotype info
            clonotype_cols = ["clonotype_id"]
            if "mait_evidence" in df_clonotypes.columns:
                clonotype_cols.append("mait_evidence")
            if "inkt_evidence" in df_clonotypes.columns:
                clonotype_cols.append("inkt_evidence")
            df_clonotypes_subset = df_clonotypes[clonotype_cols].copy()
            df_clonotypes_subset = df_clonotypes_subset.rename(
                columns={"clonotype_id": "raw_clonotype_id"}
            )
            df = df.merge(df_clonotypes_subset, on="raw_clonotype_id", how="left")

    # Combine VDJ segments into full sequence if available
    if all(col in df.columns for col in VDJ_SEGMENT_COLS):
        df["vdj_aa"] = df[VDJ_SEGMENT_COLS].fillna("").agg("".join, axis=1)
    if all(col in df.columns for col in VDJ_SEGMENT_NT_COLS):
        df["vdj_nt"] = df[VDJ_SEGMENT_NT_COLS].fillna("").agg("".join, axis=1)

    return df

load_cellranger_gex

load_cellranger_gex(gex_dir: str | Path, sample_name: str, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0, verbose: bool = True) -> ad.AnnData

Load CellRanger gene expression output.

Parameters:

Name Type Description Default
gex_dir str or Path

Path to CellRanger count output directory

required
sample_name str

Name to assign to this sample

required
min_genes int

Minimum genes detected per cell

250
max_genes int

Maximum genes detected per cell

15000
min_counts int

Minimum UMI counts per cell

500
max_counts int

Maximum UMI counts per cell

100000
max_mito_pct float

Maximum mitochondrial percentage

8.0
min_mito_pct float

Minimum mitochondrial percentage

2.0
verbose bool

Print progress information

True

Returns:

Type Description
AnnData

AnnData object with gene expression data

Source code in tcrsift/loader.py
def load_cellranger_gex(
    gex_dir: str | Path,
    sample_name: str,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
    verbose: bool = True,
) -> ad.AnnData:
    """
    Load CellRanger gene expression output.

    Parameters
    ----------
    gex_dir : str or Path
        Path to CellRanger count output directory
    sample_name : str
        Name to assign to this sample
    min_genes : int
        Minimum genes detected per cell
    max_genes : int
        Maximum genes detected per cell
    min_counts : int
        Minimum UMI counts per cell
    max_counts : int
        Maximum UMI counts per cell
    max_mito_pct : float
        Maximum mitochondrial percentage
    min_mito_pct : float
        Minimum mitochondrial percentage
    verbose : bool
        Print progress information

    Returns
    -------
    ad.AnnData
        AnnData object with gene expression data
    """
    # Validate numeric parameters
    validate_numeric_param(min_genes, "min_genes", min_value=0)
    validate_numeric_param(max_genes, "max_genes", min_value=1)
    validate_numeric_param(min_counts, "min_counts", min_value=0)
    validate_numeric_param(max_counts, "max_counts", min_value=1)
    validate_numeric_param(min_mito_pct, "min_mito_pct", min_value=0, max_value=100)
    validate_numeric_param(max_mito_pct, "max_mito_pct", min_value=0, max_value=100)

    if min_genes > max_genes:
        raise TCRsiftValidationError(
            f"min_genes ({min_genes}) cannot be greater than max_genes ({max_genes})",
            hint="Check your QC filter parameters.",
        )
    if min_counts > max_counts:
        raise TCRsiftValidationError(
            f"min_counts ({min_counts}) cannot be greater than max_counts ({max_counts})",
            hint="Check your QC filter parameters.",
        )
    if min_mito_pct > max_mito_pct:
        raise TCRsiftValidationError(
            f"min_mito_pct ({min_mito_pct}) cannot be greater than max_mito_pct ({max_mito_pct})",
            hint="Check your QC filter parameters.",
        )

    gex_dir = Path(gex_dir)

    # Validate directory
    try:
        gex_dir = validate_cellranger_gex_dir(gex_dir)
    except TCRsiftValidationError:
        raise TCRsiftValidationError(
            f"Invalid CellRanger GEX directory for sample '{sample_name}': {gex_dir}",
            hint="Make sure this is the 'outs' directory from 'cellranger count'. "
            "It should contain 'filtered_feature_bc_matrix' or 'filtered_feature_bc_matrix.h5'.",
        )

    # Try standard CellRanger output locations
    matrix_dir = gex_dir / "filtered_feature_bc_matrix"
    if not matrix_dir.exists():
        matrix_dir = gex_dir / "outs" / "filtered_feature_bc_matrix"
    if not matrix_dir.exists():
        # Try h5 file
        h5_path = gex_dir / "filtered_feature_bc_matrix.h5"
        if not h5_path.exists():
            h5_path = gex_dir / "outs" / "filtered_feature_bc_matrix.h5"
        if h5_path.exists():
            logger.info(f"Loading GEX from h5 file: {h5_path}")
            adata = sc.read_10x_h5(str(h5_path))
        else:
            available = [f.name for f in gex_dir.iterdir()][:15]
            raise TCRsiftValidationError(
                f"Gene expression matrix not found in: {gex_dir}",
                hint=f"Expected 'filtered_feature_bc_matrix' directory or 'filtered_feature_bc_matrix.h5'. "
                f"Available files/directories: {available}",
            )
    else:
        logger.info(f"Loading GEX from matrix directory: {matrix_dir}")
        adata = sc.read_10x_mtx(str(matrix_dir), var_names="gene_ids")

    # Validate loaded data
    if adata.n_obs == 0:
        raise TCRsiftValidationError(
            f"Gene expression matrix contains no cells: {gex_dir}",
            hint="Check that CellRanger count ran successfully.",
        )

    if verbose:
        logger.info(f"  Loaded {adata.n_obs:,} cells x {adata.n_vars:,} genes")

    # Add sample information
    adata.obs["sample"] = sample_name
    adata.obs["gex_dir"] = str(gex_dir)

    # Calculate QC metrics - detect mitochondrial genes
    # Handle both loading paths:
    # - h5 path: var_names are gene symbols (MT-ND1, etc.)
    # - mtx path with var_names="gene_ids": var_names are ENSEMBL IDs, symbols in var['gene_symbols']
    if "gene_symbols" in adata.var.columns:
        adata.var["mt"] = adata.var["gene_symbols"].str.startswith("MT-")
    else:
        adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

    # Rename QC columns for consistency
    if "pct_counts_mt" in adata.obs.columns:
        adata.obs["percent_mt"] = adata.obs["pct_counts_mt"]
    if "n_genes_by_counts" in adata.obs.columns:
        adata.obs["n_genes"] = adata.obs["n_genes_by_counts"]
    if "total_counts" in adata.obs.columns:
        adata.obs["n_counts"] = adata.obs["total_counts"]

    # Add QC filter flags
    adata.obs["filter:min_genes"] = adata.obs["n_genes"] >= min_genes
    adata.obs["filter:max_genes"] = adata.obs["n_genes"] <= max_genes
    adata.obs["filter:min_counts"] = adata.obs["n_counts"] >= min_counts
    adata.obs["filter:max_counts"] = adata.obs["n_counts"] <= max_counts
    adata.obs["filter:min_mito"] = adata.obs["percent_mt"] >= min_mito_pct
    adata.obs["filter:max_mito"] = adata.obs["percent_mt"] <= max_mito_pct
    adata.obs["filter:pass_qc"] = (
        adata.obs["filter:min_genes"]
        & adata.obs["filter:max_genes"]
        & adata.obs["filter:min_counts"]
        & adata.obs["filter:max_counts"]
        & adata.obs["filter:min_mito"]
        & adata.obs["filter:max_mito"]
    )

    return adata

load_sample

load_sample(sample: Sample, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0) -> ad.AnnData | None

Load all data for a single sample.

Parameters:

Name Type Description Default
sample Sample

Sample object with paths and metadata

required
min_genes int

QC filter parameters for GEX data

250
max_genes int

QC filter parameters for GEX data

250
min_counts int

QC filter parameters for GEX data

250
max_counts int

QC filter parameters for GEX data

250
max_mito_pct int

QC filter parameters for GEX data

250
min_mito_pct int

QC filter parameters for GEX data

250

Returns:

Type Description
AnnData or None

Combined AnnData with GEX and VDJ data, or None if sample has neither gex_dir nor vdj_dir.

Source code in tcrsift/loader.py
def load_sample(
    sample: Sample,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
) -> ad.AnnData | None:
    """
    Load all data for a single sample.

    Parameters
    ----------
    sample : Sample
        Sample object with paths and metadata
    min_genes, max_genes, min_counts, max_counts, max_mito_pct, min_mito_pct
        QC filter parameters for GEX data

    Returns
    -------
    ad.AnnData or None
        Combined AnnData with GEX and VDJ data, or None if sample has neither
        gex_dir nor vdj_dir.
    """
    adata = None
    vdj_df = None

    # Load GEX if available
    if sample.gex_dir:
        adata = load_cellranger_gex(
            sample.gex_dir,
            sample.sample,
            min_genes=min_genes,
            max_genes=max_genes,
            min_counts=min_counts,
            max_counts=max_counts,
            max_mito_pct=max_mito_pct,
            min_mito_pct=min_mito_pct,
        )

    # Load VDJ if available
    if sample.vdj_dir:
        vdj_df = load_cellranger_vdj(sample.vdj_dir, sample.sample)

    # Combine or create from VDJ only
    if adata is not None and vdj_df is not None:
        adata = combine_gex_and_vdj(adata, vdj_df)
    elif vdj_df is not None:
        # Create minimal AnnData from VDJ data
        vdj_pivoted = _pivot_vdj_by_barcode(vdj_df)
        adata = ad.AnnData(obs=vdj_pivoted)
        adata.obs["sample"] = sample.sample

    # Add sample metadata
    if adata is not None:
        adata.obs["antigen_type"] = sample.antigen_type
        adata.obs["antigen_description"] = sample.antigen_description
        adata.obs["antigen_name"] = sample.antigen_name
        adata.obs["antigen_sequence"] = sample.antigen_sequence
        adata.obs["epitope_sequence"] = sample.epitope_sequence
        adata.obs["mhc_allele"] = sample.mhc_allele
        adata.obs["antigen_names"] = sample.antigen_names
        adata.obs["antigen_sequences"] = sample.antigen_sequences
        adata.obs["epitope_sequences"] = sample.epitope_sequences
        adata.obs["source"] = sample.source
        adata.obs["expected_tcell_type"] = sample.get_expected_tcell_type()

    return adata

load_samples

load_samples(sample_sheet_path: str | Path | SampleSheet, min_genes: int = 250, max_genes: int = 15000, min_counts: int = 500, max_counts: int = 100000, max_mito_pct: float = 8.0, min_mito_pct: float = 2.0, verbose: bool = True, show_progress: bool = True) -> ad.AnnData

Load all samples from a sample sheet into a single AnnData object.

Parameters:

Name Type Description Default
sample_sheet_path str or Path or SampleSheet

Path to sample sheet (CSV or YAML), or a SampleSheet instance

required
min_genes int

QC filter parameters

250
max_genes int

QC filter parameters

250
min_counts int

QC filter parameters

250
max_counts int

QC filter parameters

250
max_mito_pct int

QC filter parameters

250
min_mito_pct int

QC filter parameters

250
verbose bool

Print detailed progress information

True
show_progress bool

Show progress bar

True

Returns:

Type Description
AnnData

Combined AnnData with all samples

Source code in tcrsift/loader.py
def load_samples(
    sample_sheet_path: str | Path | SampleSheet,
    min_genes: int = 250,
    max_genes: int = 15000,
    min_counts: int = 500,
    max_counts: int = 100000,
    max_mito_pct: float = 8.0,
    min_mito_pct: float = 2.0,
    verbose: bool = True,
    show_progress: bool = True,
) -> ad.AnnData:
    """
    Load all samples from a sample sheet into a single AnnData object.

    Parameters
    ----------
    sample_sheet_path : str or Path or SampleSheet
        Path to sample sheet (CSV or YAML), or a SampleSheet instance
    min_genes, max_genes, min_counts, max_counts, max_mito_pct, min_mito_pct
        QC filter parameters
    verbose : bool
        Print detailed progress information
    show_progress : bool
        Show progress bar

    Returns
    -------
    ad.AnnData
        Combined AnnData with all samples
    """
    # Load sample sheet (path or object)
    if isinstance(sample_sheet_path, SampleSheet):
        sample_sheet = sample_sheet_path
        sample_sheet_label = "<SampleSheet>"
    else:
        sample_sheet_path = validate_file_exists(sample_sheet_path, "sample sheet")
        sample_sheet = load_sample_sheet(sample_sheet_path)
        sample_sheet_label = str(sample_sheet_path)

    if len(sample_sheet) == 0:
        raise TCRsiftValidationError(
            f"Sample sheet is empty: {sample_sheet_label}",
            hint="Add sample entries to the sample sheet.",
        )

    logger.info(f"Loading {len(sample_sheet)} samples from {sample_sheet_label}")

    # Pre-validate all sample paths to fail fast
    validation_errors = []
    for sample in sample_sheet:
        if sample.vdj_dir and not Path(sample.vdj_dir).exists():
            validation_errors.append(
                f"Sample '{sample.sample}': VDJ directory not found: {sample.vdj_dir}"
            )
        if sample.gex_dir and not Path(sample.gex_dir).exists():
            validation_errors.append(
                f"Sample '{sample.sample}': GEX directory not found: {sample.gex_dir}"
            )

    if validation_errors:
        raise TCRsiftValidationError(
            f"Sample sheet validation failed with {len(validation_errors)} error(s):\n"
            + "\n".join(f"  - {e}" for e in validation_errors[:5]),
            hint="Check that all paths in the sample sheet are correct and accessible.",
        )

    adatas = []
    total_cells = 0

    # Create progress bar iterator
    sample_iter = sample_sheet
    if show_progress:
        sample_iter = tqdm(
            sample_sheet,
            desc="Loading samples",
            unit="sample",
            disable=not show_progress,
        )

    for sample in sample_iter:
        if show_progress:
            sample_iter.set_postfix(sample=sample.sample[:20])

        if verbose:
            logger.info(f"Loading sample: {sample.sample}")

        try:
            adata = load_sample(
                sample,
                min_genes=min_genes,
                max_genes=max_genes,
                min_counts=min_counts,
                max_counts=max_counts,
                max_mito_pct=max_mito_pct,
                min_mito_pct=min_mito_pct,
            )
            if adata is not None:
                adatas.append(adata)
                total_cells += adata.n_obs
                if verbose:
                    logger.info(f"  Sample {sample.sample}: {adata.n_obs:,} cells")
        except TCRsiftValidationError:
            raise
        except Exception as e:
            raise TCRsiftValidationError(
                f"Failed to load sample '{sample.sample}': {e}",
                hint="Check that the CellRanger output directories are valid and complete. "
                f"VDJ: {sample.vdj_dir}, GEX: {sample.gex_dir}",
            ) from e

    if not adatas:
        raise TCRsiftValidationError(
            "No samples loaded successfully",
            hint="Check that at least one sample has valid VDJ or GEX data.",
        )

    # Concatenate all samples
    logger.info(f"Concatenating {len(adatas)} samples ({total_cells:,} total cells)")

    if show_progress:
        # Show progress for concatenation (can be slow for large datasets)
        with tqdm(total=1, desc="Concatenating samples", unit="step") as pbar:
            combined = ad.concat(
                adatas,
                join="outer",
                label="sample",
                keys=[a.obs["sample"].iloc[0] for a in adatas],
            )
            pbar.update(1)
    else:
        combined = ad.concat(
            adatas,
            join="outer",
            label="sample",
            keys=[a.obs["sample"].iloc[0] for a in adatas],
        )

    # Store sample sheet as uns
    combined.uns["sample_sheet"] = sample_sheet.to_dataframe().to_dict()

    logger.info(f"Successfully loaded {combined.n_obs:,} cells from {len(adatas)} samples")

    return combined