Skip to content

Sample Sheet API

Module for parsing and validating sample sheets.

sample_sheet

Sample sheet parsing for TCRsift.

Supports both CSV and YAML formats for specifying samples and their metadata.

ANTIGEN_TYPE_TCELL_EXPECTATIONS module-attribute

# Default T cell expectation ("CD4", "CD8" or "mixed") for each antigen type;
# consulted by Sample.get_expected_tcell_type when nothing more specific is set.
ANTIGEN_TYPE_TCELL_EXPECTATIONS = {
    "short_peptide": "CD8",
    "long_peptide": "mixed",
    "peptide_pool": "mixed",
    "minigene": "mixed",
    "minigene_library": "mixed",
    "whole_protein": "mixed",
    "mrna": "mixed",
    "tetramer_mhc1": "CD8",
    "tetramer_mhc2": "CD4",
    "sct": "CD8",
}

VALID_ANTIGEN_TYPES module-attribute

# The accepted antigen types are exactly the keys of the expectations table,
# so adding an entry there automatically makes it a valid antigen_type.
VALID_ANTIGEN_TYPES = set(ANTIGEN_TYPE_TCELL_EXPECTATIONS.keys())

VALID_SOURCES module-attribute

# Allowed values for Sample.source; any other value is rejected with a
# ValueError in Sample.__post_init__.
VALID_SOURCES = {
    "culture",
    "tetramer",
    "sct",
    "til",
}

Sample dataclass

Represents a single sample with its metadata.

Source code in tcrsift/sample_sheet.py
@dataclass
class Sample:
    """Represents a single sample with its metadata."""

    sample: str
    gex_dir: str | None = None
    vdj_dir: str | None = None
    antigen_type: str | None = None
    antigen_description: str | None = None
    culture_days: int | None = None
    tcell_type_expected: str | None = None
    pre_sorted: str | None = None
    mhc_blocking: str | None = None
    source: str = "culture"
    # Antigen fields - what was given to APCs (protein, long peptide, minigene, etc.)
    antigen_name: str | None = None  # name of source antigen (e.g., "PRAME", "CMV pp65")
    antigen_sequence: str | None = None  # sequence of source antigen (may be long)
    # Epitope fields - minimal peptide that binds MHC
    epitope_sequence: str | None = None  # minimal peptide AA sequence (e.g., "SLLQHLIGL")
    mhc_allele: str | None = None  # MHC restriction (e.g., "HLA-A*02:01")
    # Pool/library fields - lists for multiple antigens
    antigen_names: list | None = None  # list of source antigen names
    antigen_sequences: list | None = None  # list of source antigen sequences
    epitope_sequences: list | None = None  # list of minimal epitope sequences (if known)
    # Other metadata
    tissue: str | None = None
    patient_id: str | None = None
    # Experiment grouping for multi-source unification
    experiment: str | None = None  # experiment name for unification (e.g., "TIL", "Culture", "SCT")
    # SCT-specific fields
    sct_path: str | None = None  # path to SCT Excel file
    sct_sheet: str = "Cell"  # sheet name in SCT Excel file
    # Standalone GEX file (for augmentation without full CellRanger output)
    gex_path: str | None = None  # path to 10x filtered_feature_bc_matrix.h5 file
    # TIL-specific input fields (alternative to vdj_dir for TIL samples)
    til_csv: str | None = None  # path to CSV with CDR3_alpha, CDR3_beta columns
    til_h5ad: str | None = None  # path to pre-processed h5ad file

    def __post_init__(self):
        # Validate at least one data source
        has_cellranger = self.gex_dir or self.vdj_dir
        has_sct = self.sct_path is not None
        has_til_data = self.til_csv is not None or self.til_h5ad is not None
        if not has_cellranger and not has_sct and not has_til_data:
            raise ValueError(
                f"Sample '{self.sample}' must have at least gex_dir, vdj_dir, sct_path, til_csv, or til_h5ad"
            )

        # Validate antigen type
        if self.antigen_type and self.antigen_type not in VALID_ANTIGEN_TYPES:
            raise ValueError(
                f"Invalid antigen_type '{self.antigen_type}' for sample '{self.sample}'. "
                f"Valid types: {VALID_ANTIGEN_TYPES}"
            )

        # Validate source
        if self.source not in VALID_SOURCES:
            raise ValueError(
                f"Invalid source '{self.source}' for sample '{self.sample}'. "
                f"Valid sources: {VALID_SOURCES}"
            )

        # Validate tcell_type_expected
        if self.tcell_type_expected and self.tcell_type_expected not in VALID_TCELL_TYPES:
            raise ValueError(
                f"Invalid tcell_type_expected '{self.tcell_type_expected}' for sample '{self.sample}'. "
                f"Valid types: {VALID_TCELL_TYPES}"
            )

        # Validate pre_sorted
        if self.pre_sorted and self.pre_sorted not in VALID_PRE_SORTED:
            raise ValueError(
                f"Invalid pre_sorted '{self.pre_sorted}' for sample '{self.sample}'. "
                f"Valid values: {VALID_PRE_SORTED}"
            )

        # Validate mhc_blocking
        if self.mhc_blocking and self.mhc_blocking not in VALID_MHC_BLOCKING:
            raise ValueError(
                f"Invalid mhc_blocking '{self.mhc_blocking}' for sample '{self.sample}'. "
                f"Valid values: {VALID_MHC_BLOCKING}"
            )

    def get_expected_tcell_type(self) -> str | None:
        """
        Determine expected T cell type based on antigen type, blocking, and sorting.

        Returns the most specific expectation available.
        """
        # Direct specification takes priority
        if self.tcell_type_expected:
            return self.tcell_type_expected

        # Pre-sorting is definitive
        if self.pre_sorted:
            return self.pre_sorted

        # MHC blocking tells us what's NOT expected
        if self.mhc_blocking == "MHC-I":
            return "CD4"  # CD8 responses blocked
        if self.mhc_blocking == "MHC-II":
            return "CD8"  # CD4 responses blocked

        # Antigen type gives us expectations
        if self.antigen_type:
            return ANTIGEN_TYPE_TCELL_EXPECTATIONS.get(self.antigen_type)

        return None

    def is_tetramer_or_sct(self) -> bool:
        """Check if this sample is from tetramer or SCT selection."""
        return self.source in {"tetramer", "sct"} or (
            self.antigen_type is not None
            and (self.antigen_type.startswith("tetramer_") or self.antigen_type == "sct")
        )

    def is_til(self) -> bool:
        """Check if this sample is TIL data."""
        return self.source == "til"

    def is_sct_data(self) -> bool:
        """Check if this sample is SCT platform data."""
        return self.source == "sct" or self.sct_path is not None

    def get_til_data_source(self) -> tuple[str, str] | None:
        """
        Get the TIL data source type and path.

        Returns
        -------
        tuple[str, str] | None
            Tuple of (source_type, path) where source_type is one of:
            - "h5ad": Pre-processed AnnData file
            - "csv": CSV file with CDR3_alpha, CDR3_beta columns
            - "vdj_dir": CellRanger VDJ directory
            Returns None if this is not a TIL sample or has no data source.
        """
        if not self.is_til():
            return None

        if self.til_h5ad:
            return ("h5ad", self.til_h5ad)
        elif self.til_csv:
            return ("csv", self.til_csv)
        elif self.vdj_dir:
            return ("vdj_dir", self.vdj_dir)
        return None

get_expected_tcell_type

get_expected_tcell_type() -> str | None

Determine expected T cell type based on antigen type, blocking, and sorting.

Returns the most specific expectation available.

Source code in tcrsift/sample_sheet.py
def get_expected_tcell_type(self) -> str | None:
    """
    Determine expected T cell type based on antigen type, blocking, and sorting.

    Returns the most specific expectation available.
    """
    # Direct specification takes priority
    if self.tcell_type_expected:
        return self.tcell_type_expected

    # Pre-sorting is definitive
    if self.pre_sorted:
        return self.pre_sorted

    # MHC blocking tells us what's NOT expected
    if self.mhc_blocking == "MHC-I":
        return "CD4"  # CD8 responses blocked
    if self.mhc_blocking == "MHC-II":
        return "CD8"  # CD4 responses blocked

    # Antigen type gives us expectations
    if self.antigen_type:
        return ANTIGEN_TYPE_TCELL_EXPECTATIONS.get(self.antigen_type)

    return None

is_tetramer_or_sct

is_tetramer_or_sct() -> bool

Check if this sample is from tetramer or SCT selection.

Source code in tcrsift/sample_sheet.py
def is_tetramer_or_sct(self) -> bool:
    """Tell whether the sample comes from tetramer or SCT selection."""
    # Either the declared source says so ...
    if self.source in {"tetramer", "sct"}:
        return True
    # ... or the antigen type does ("tetramer_*" or "sct").
    kind = self.antigen_type
    if kind is None:
        return False
    return kind.startswith("tetramer_") or kind == "sct"

is_til

is_til() -> bool

Check if this sample is TIL data.

Source code in tcrsift/sample_sheet.py
def is_til(self) -> bool:
    """Tell whether the sample's source is ``"til"``."""
    origin = self.source
    return origin == "til"

is_sct_data

is_sct_data() -> bool

Check if this sample is SCT platform data.

Source code in tcrsift/sample_sheet.py
def is_sct_data(self) -> bool:
    """Tell whether the sample is SCT platform data (by source or by ``sct_path``)."""
    if self.source == "sct":
        return True
    # A sample with another source still counts once an SCT file is attached.
    return self.sct_path is not None

get_til_data_source

get_til_data_source() -> tuple[str, str] | None

Get the TIL data source type and path.

Returns:

Type Description
tuple[str, str] | None

Tuple of (source_type, path), where source_type is one of "h5ad" (pre-processed AnnData file), "csv" (CSV file with CDR3_alpha, CDR3_beta columns), or "vdj_dir" (CellRanger VDJ directory). Returns None if this is not a TIL sample or has no data source.

Source code in tcrsift/sample_sheet.py
def get_til_data_source(self) -> tuple[str, str] | None:
    """
    Get the TIL data source type and path.

    Returns
    -------
    tuple[str, str] | None
        Tuple of (source_type, path) where source_type is one of:
        - "h5ad": Pre-processed AnnData file
        - "csv": CSV file with CDR3_alpha, CDR3_beta columns
        - "vdj_dir": CellRanger VDJ directory
        Returns None if this is not a TIL sample or has no data source.
    """
    if not self.is_til():
        return None

    if self.til_h5ad:
        return ("h5ad", self.til_h5ad)
    elif self.til_csv:
        return ("csv", self.til_csv)
    elif self.vdj_dir:
        return ("vdj_dir", self.vdj_dir)
    return None

SampleSheet dataclass

Collection of samples with their metadata.

Source code in tcrsift/sample_sheet.py
@dataclass
class SampleSheet:
    """Ordered collection of ``Sample`` entries with lookup and grouping helpers.

    The sheet supports ``len()``, iteration and indexing, all of which are
    forwarded to the underlying ``samples`` list.
    """

    samples: list[Sample] = field(default_factory=list)

    def __len__(self):
        return len(self.samples)

    def __iter__(self):
        return iter(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

    def get_sample(self, name: str) -> Sample | None:
        """Return the first sample called ``name``, or ``None`` if there is none."""
        matches = (entry for entry in self.samples if entry.sample == name)
        return next(matches, None)

    def get_culture_samples(self) -> list[Sample]:
        """Return the samples whose ``source`` is exactly ``"culture"``."""
        cultured = []
        for entry in self.samples:
            if entry.source == "culture":
                cultured.append(entry)
        return cultured

    def get_til_samples(self) -> list[Sample]:
        """Return the samples for which ``is_til()`` is true."""
        til_entries = []
        for entry in self.samples:
            if entry.is_til():
                til_entries.append(entry)
        return til_entries

    def get_tetramer_samples(self) -> list[Sample]:
        """Return the samples for which ``is_tetramer_or_sct()`` is true."""
        selected = []
        for entry in self.samples:
            if entry.is_tetramer_or_sct():
                selected.append(entry)
        return selected

    def get_sct_samples(self) -> list[Sample]:
        """Return the samples for which ``is_sct_data()`` is true."""
        sct_entries = []
        for entry in self.samples:
            if entry.is_sct_data():
                sct_entries.append(entry)
        return sct_entries

    def get_samples_by_experiment(self) -> dict[str, list[Sample]]:
        """
        Bucket the samples by their experiment name.

        The result maps each experiment name to its samples, in sheet order.
        Samples whose ``experiment`` is empty land in the 'default' bucket.
        """
        buckets: dict[str, list[Sample]] = {}
        for entry in self.samples:
            label = entry.experiment or "default"
            buckets.setdefault(label, []).append(entry)
        return buckets

    def get_experiment_names(self) -> list[str]:
        """Return each experiment name once, in order of first appearance."""
        # dict keys are unique and keep insertion order, which is exactly the
        # "first seen" ordering wanted here.
        labels = (entry.experiment or "default" for entry in self.samples)
        return list(dict.fromkeys(labels))

    def to_dataframe(self) -> pd.DataFrame:
        """Build a pandas DataFrame with one row per sample."""
        # Attributes copied verbatim, in the column order of the output.
        copied_columns = (
            "sample",
            "gex_dir",
            "vdj_dir",
            "gex_path",
            "sct_path",
            "sct_sheet",
            "experiment",
            "antigen_type",
            "antigen_description",
            "antigen_name",
            "epitope_sequence",
            "mhc_allele",
            "culture_days",
            "tcell_type_expected",
            "pre_sorted",
            "mhc_blocking",
            "source",
        )
        rows = []
        for entry in self.samples:
            row = {column: getattr(entry, column) for column in copied_columns}
            # Derived column: the resolved expectation, appended last.
            row["expected_tcell_type"] = entry.get_expected_tcell_type()
            rows.append(row)
        return pd.DataFrame(rows)

get_sample

get_sample(name: str) -> Sample | None

Get a sample by name.

Source code in tcrsift/sample_sheet.py
def get_sample(self, name: str) -> Sample | None:
    """Return the first sample called ``name``, or ``None`` if there is none."""
    matches = (entry for entry in self.samples if entry.sample == name)
    return next(matches, None)

get_culture_samples

get_culture_samples() -> list[Sample]

Get all culture samples (not TIL, tetramer, or SCT).

Source code in tcrsift/sample_sheet.py
def get_culture_samples(self) -> list[Sample]:
    """Return the samples whose ``source`` is exactly ``"culture"``."""
    cultured = []
    for entry in self.samples:
        if entry.source == "culture":
            cultured.append(entry)
    return cultured

get_til_samples

get_til_samples() -> list[Sample]

Get all TIL samples.

Source code in tcrsift/sample_sheet.py
def get_til_samples(self) -> list[Sample]:
    """Return the samples for which ``is_til()`` is true."""
    til_entries = []
    for entry in self.samples:
        if entry.is_til():
            til_entries.append(entry)
    return til_entries

get_tetramer_samples

get_tetramer_samples() -> list[Sample]

Get all tetramer/SCT samples.

Source code in tcrsift/sample_sheet.py
def get_tetramer_samples(self) -> list[Sample]:
    """Return the samples for which ``is_tetramer_or_sct()`` is true."""
    selected = []
    for entry in self.samples:
        if entry.is_tetramer_or_sct():
            selected.append(entry)
    return selected

get_sct_samples

get_sct_samples() -> list[Sample]

Get all SCT platform samples.

Source code in tcrsift/sample_sheet.py
def get_sct_samples(self) -> list[Sample]:
    """Return the samples for which ``is_sct_data()`` is true."""
    sct_entries = []
    for entry in self.samples:
        if entry.is_sct_data():
            sct_entries.append(entry)
    return sct_entries

get_samples_by_experiment

get_samples_by_experiment() -> dict[str, list[Sample]]

Group samples by experiment name.

Returns a dictionary mapping experiment names to lists of samples. Samples without an experiment field are grouped under 'default'.

Source code in tcrsift/sample_sheet.py
def get_samples_by_experiment(self) -> dict[str, list[Sample]]:
    """
    Bucket the samples by their experiment name.

    The result maps each experiment name to its samples, in sheet order.
    Samples whose ``experiment`` is empty land in the 'default' bucket.
    """
    buckets: dict[str, list[Sample]] = {}
    for entry in self.samples:
        label = entry.experiment or "default"
        # setdefault creates the bucket on first sight of a label.
        buckets.setdefault(label, []).append(entry)
    return buckets

get_experiment_names

get_experiment_names() -> list[str]

Get unique experiment names, preserving order.

Source code in tcrsift/sample_sheet.py
def get_experiment_names(self) -> list[str]:
    """Return each experiment name once, in order of first appearance."""
    # dict keys are unique and keep insertion order, which is exactly the
    # "first seen" ordering wanted here. Empty names count as "default".
    labels = (entry.experiment or "default" for entry in self.samples)
    return list(dict.fromkeys(labels))

to_dataframe

to_dataframe() -> pd.DataFrame

Convert sample sheet to a pandas DataFrame.

Source code in tcrsift/sample_sheet.py
def to_dataframe(self) -> pd.DataFrame:
    """Build a pandas DataFrame with one row per sample."""
    # Attributes copied verbatim, in the column order of the output.
    copied_columns = (
        "sample",
        "gex_dir",
        "vdj_dir",
        "gex_path",
        "sct_path",
        "sct_sheet",
        "experiment",
        "antigen_type",
        "antigen_description",
        "antigen_name",
        "epitope_sequence",
        "mhc_allele",
        "culture_days",
        "tcell_type_expected",
        "pre_sorted",
        "mhc_blocking",
        "source",
    )
    rows = []
    for entry in self.samples:
        row = {column: getattr(entry, column) for column in copied_columns}
        # Derived column: the resolved expectation, appended last.
        row["expected_tcell_type"] = entry.get_expected_tcell_type()
        rows.append(row)
    return pd.DataFrame(rows)

load_sample_sheet

load_sample_sheet(path: str | Path) -> SampleSheet

Load a sample sheet from CSV or YAML file.

Parameters:

Name Type Description Default
path str or Path

Path to the sample sheet file (.csv, .tsv, .yaml, or .yml)

required

Returns:

Type Description
SampleSheet

Parsed sample sheet with all samples

Source code in tcrsift/sample_sheet.py
def load_sample_sheet(path: str | Path) -> SampleSheet:
    """
    Read a sample sheet, choosing the parser from the file extension.

    Parameters
    ----------
    path : str or Path
        Location of the sample sheet (.csv, .tsv, .yaml, or .yml); the
        extension is matched case-insensitively.

    Returns
    -------
    SampleSheet
        The parsed sheet containing every sample in the file.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``.
    ValueError
        If the extension is not one of the supported ones.
    """
    sheet_path = Path(path)
    if not sheet_path.exists():
        raise FileNotFoundError(f"Sample sheet not found: {sheet_path}")

    extension = sheet_path.suffix.lower()

    if extension in {".yaml", ".yml"}:
        return _load_yaml_sample_sheet(sheet_path)

    # Delimited text: the extension decides the column separator.
    separators = {".csv": ",", ".tsv": "\t"}
    if extension in separators:
        return _load_csv_sample_sheet(sheet_path, sep=separators[extension])

    raise ValueError(
        f"Unsupported sample sheet format: {extension}. Use .csv, .tsv, .yaml, or .yml"
    )

validate_sample_sheet

validate_sample_sheet(sample_sheet: SampleSheet) -> list[str]

Validate a sample sheet and return any warnings.

Parameters:

Name Type Description Default
sample_sheet SampleSheet

The sample sheet to validate

required

Returns:

Type Description
list[str]

List of warning messages (empty if no warnings)

Source code in tcrsift/sample_sheet.py
def validate_sample_sheet(sample_sheet: SampleSheet) -> list[str]:
    """
    Validate a sample sheet and return any warnings.

    Nothing is raised for the conditions checked here; each problem is
    reported as a human-readable message instead. The checks are, in order:
    duplicated sample names (one message for the whole sheet), then per
    sample: antigen type / expected T cell type combinations that look
    unusual, and configured input paths that do not exist on disk.

    Parameters
    ----------
    sample_sheet : SampleSheet
        The sample sheet to validate

    Returns
    -------
    list[str]
        List of warning messages (empty if no warnings)
    """
    messages = []

    # Duplicate names: one pass with two sets instead of calling
    # list.count() for every element, which was quadratic in sheet size.
    seen = set()
    repeated = set()
    for sample in sample_sheet.samples:
        name = sample.sample
        if name in seen:
            repeated.add(name)
        else:
            seen.add(name)
    if repeated:
        messages.append(f"Duplicate sample names found: {repeated}")

    # Path-valued attributes, checked in this order for every sample.
    path_fields = ("gex_dir", "vdj_dir", "sct_path", "gex_path", "til_csv", "til_h5ad")

    for sample in sample_sheet.samples:
        expected = sample.get_expected_tcell_type()

        # Long peptide combined with a CD8 expectation is unusual enough to flag.
        if sample.antigen_type == "long_peptide" and expected == "CD8":
            messages.append(
                f"Sample '{sample.sample}': Long peptides typically favor CD4+ responses, "
                "but CD8+ is expected. Consider if this is correct."
            )

        # Likewise a short peptide combined with a CD4 expectation.
        if sample.antigen_type == "short_peptide" and expected == "CD4":
            messages.append(
                f"Sample '{sample.sample}': Short peptides typically bind MHC-I for CD8+ responses, "
                "but CD4+ is expected. Consider if this is correct."
            )

        # Only paths that are actually set are checked for existence.
        for field_name in path_fields:
            location = getattr(sample, field_name)
            if location and not Path(location).exists():
                messages.append(
                    f"Sample '{sample.sample}': {field_name} does not exist: {location}"
                )

    return messages