TIL Matching API¶

Module for matching clonotypes against tumor-infiltrating lymphocyte data.

til ¶

TIL (Tumor-Infiltrating Lymphocyte) matching for TCRsift.

Identifies culture-validated TCRs in TIL samples. Supports loading TIL data from multiple formats:

- h5ad: Pre-processed AnnData files
- CSV: Simple CSV with CDR3_alpha, CDR3_beta columns
- vdj_dir: CellRanger VDJ output directory

load_til_data ¶

load_til_data(source_type: str, path: str | Path, sample_name: str | None = None) -> pd.DataFrame

Load TIL data from various formats into a standard DataFrame.

Parameters:

Name	Type	Description	Default
`source_type`	`str`	Type of source: "h5ad", "csv", or "vdj_dir"	required
`path`	`str or Path`	Path to the data source	required
`sample_name`	`str`	Name to assign to this sample (used in output)	`None`

Returns:

Type	Description
`DataFrame`	DataFrame with at least CDR3_alpha, CDR3_beta, and sample columns. Each row represents one cell.

Raises:

Type	Description
`TCRsiftValidationError`	If the file doesn't exist or has invalid format

Source code in tcrsift/til.py

def load_til_data(
    source_type: str,
    path: str | Path,
    sample_name: str | None = None,
) -> pd.DataFrame:
    """
    Read a single TIL sample into the standard per-cell DataFrame layout.

    Parameters
    ----------
    source_type : str
        Which loader to dispatch to: "h5ad", "csv", or "vdj_dir"
    path : str or Path
        Location of the data source
    sample_name : str, optional
        Label to attach to this sample. When omitted (or empty), the stem
        of ``path`` is used instead.

    Returns
    -------
    pd.DataFrame
        DataFrame with at least CDR3_alpha, CDR3_beta, and sample columns.
        Each row represents one cell.

    Raises
    ------
    TCRsiftValidationError
        If ``source_type`` is not one of the supported values. The
        format-specific loaders are expected to raise it as well for a
        missing or malformed source (they are not shown here).
    """
    source_path = Path(path)
    # Fall back to the file/directory stem when no explicit label is given.
    label = sample_name or source_path.stem

    if source_type == "h5ad":
        return _load_til_from_h5ad(source_path, label)

    if source_type == "csv":
        return _load_til_from_csv(source_path, label)

    if source_type == "vdj_dir":
        return _load_til_from_vdj_dir(source_path, label)

    # Nothing matched: report the offending value together with the valid ones.
    raise TCRsiftValidationError(
        f"Unknown TIL source type: {source_type}",
        hint="Valid types are: h5ad, csv, vdj_dir",
    )

load_til_samples ¶

load_til_samples(samples: list[Sample] | SampleSheet) -> dict[str, pd.DataFrame]

Load TIL data from multiple samples.

Parameters:

Name	Type	Description	Default
`samples`	`list[Sample] or SampleSheet`	Samples to load (only TIL samples will be processed)	required

Returns:

Type	Description
`dict[str, DataFrame]`	Dictionary mapping sample names to DataFrames

Source code in tcrsift/til.py

def load_til_samples(
    samples: list[Sample] | SampleSheet,
) -> dict[str, pd.DataFrame]:
    """
    Load the TIL data of every TIL sample in a sample collection.

    Parameters
    ----------
    samples : list[Sample] or SampleSheet
        Candidate samples. A SampleSheet is asked for its TIL samples; a
        plain list is filtered with ``Sample.is_til()``.

    Returns
    -------
    dict[str, pd.DataFrame]
        Mapping of sample name to the loaded per-cell DataFrame. Samples
        without a TIL data source are skipped with a warning.
    """
    til_samples = (
        samples.get_til_samples()
        if isinstance(samples, SampleSheet)
        else [entry for entry in samples if entry.is_til()]
    )

    loaded = {}
    for entry in til_samples:
        source = entry.get_til_data_source()
        if source is None:
            logger.warning(f"TIL sample '{entry.sample}' has no data source, skipping")
            continue

        kind, location = source
        frame = load_til_data(kind, location, entry.sample)

        # Only annotate tissue when the loader did not already provide it.
        if entry.tissue and "tissue" not in frame.columns:
            frame["tissue"] = entry.tissue

        loaded[entry.sample] = frame

    return loaded

load_til_specs ¶

load_til_specs(specs: list[str]) -> dict[str, pd.DataFrame]

Load TIL data from repeatable --til-sample specs.

Spec format:

- NAME=TYPE:PATH
- TYPE:PATH (sample name inferred from PATH stem)

Example:

- T1=csv:/path/to/til_t1.csv
- T2=vdj:/path/to/cellranger_vdj_outs

Source code in tcrsift/til.py

def load_til_specs(
    specs: list[str],
) -> dict[str, pd.DataFrame]:
    """
    Load TIL data described by repeatable --til-sample specs.

    Each spec is handed to ``parse_til_sample_spec`` and is written as
    either ``NAME=TYPE:PATH`` or ``TYPE:PATH`` (in which case the sample
    name is inferred from the PATH stem), for example
    ``T1=csv:/path/to/til_t1.csv`` or ``T2=vdj:/path/to/cellranger_vdj_outs``.

    Parameters
    ----------
    specs : list[str]
        Spec strings, one per TIL sample

    Returns
    -------
    dict[str, pd.DataFrame]
        Mapping of sample name to loaded DataFrame, in spec order

    Raises
    ------
    TCRsiftValidationError
        If two specs resolve to the same sample name
    """
    loaded: dict[str, pd.DataFrame] = {}

    for spec in specs:
        name, kind, location = parse_til_sample_spec(spec)

        # Refuse to silently overwrite an earlier sample with the same name.
        if name in loaded:
            raise TCRsiftValidationError(
                f"Duplicate TIL sample name in --til-sample specs: '{name}'",
                hint="Use unique sample names for each --til-sample entry.",
            )

        loaded[name] = load_til_data(kind, location, name)

    return loaded

summarize_til_clonotypes ¶

summarize_til_clonotypes(til_data: AnnData | DataFrame | dict[str, DataFrame], match_by: str = 'CDR3ab', min_cells: int = 1) -> pd.DataFrame

Summarize TIL-only data into clonotype-level counts/frequencies across samples.

Parameters:

Name	Type	Description	Default
`til_data`	`AnnData, DataFrame, or dict[str, DataFrame]`	TIL data source(s)	required
`match_by`	`str`	"CDR3ab" (alpha+beta) or "CDR3b_only" (beta only)	`'CDR3ab'`
`min_cells`	`int`	Minimum total cells across all TIL samples to retain a clonotype	`1`

Returns:

Type	Description
`DataFrame`	One row per clonotype with total/per-sample TIL counts and frequencies.

Source code in tcrsift/til.py

def summarize_til_clonotypes(
    til_data: ad.AnnData | pd.DataFrame | dict[str, pd.DataFrame],
    match_by: str = "CDR3ab",
    min_cells: int = 1,
) -> pd.DataFrame:
    """
    Summarize TIL-only data into clonotype-level counts/frequencies across samples.

    Parameters
    ----------
    til_data : AnnData, DataFrame, or dict[str, DataFrame]
        TIL data source(s). An AnnData contributes its ``.obs``; a single
        AnnData/DataFrame is treated as one sample named "TIL".
    match_by : str
        "CDR3ab" (alpha+beta) or "CDR3b_only" (beta only)
    min_cells : int
        Minimum total cells across all TIL samples to retain a clonotype

    Returns
    -------
    pd.DataFrame
        One row per clonotype, sorted by ``til_cell_count`` (descending,
        ties kept in clonotype-key order). Columns: ``CDR3ab`` (the clone
        key; the beta sequence alone when ``match_by="CDR3b_only"``),
        ``til_samples``, ``til_cell_count``, ``til_frequency``,
        ``CDR3_alpha`` (CDR3ab mode only), ``CDR3_beta``,
        ``til_cell_count.{sample}`` / ``til_frequency.{sample}`` for every
        sample, and ``n_til_samples``. When no clonotype qualifies, the
        frame is empty but still carries these columns.

    Raises
    ------
    TCRsiftValidationError
        If ``match_by`` is not a supported value or ``min_cells`` < 1
    TypeError
        If ``til_data`` is not an AnnData, DataFrame, or dict
    """
    if match_by not in {"CDR3ab", "CDR3b_only"}:
        raise TCRsiftValidationError(
            f"Invalid match_by: '{match_by}'",
            hint="Use one of: CDR3ab, CDR3b_only",
        )

    if min_cells < 1:
        raise TCRsiftValidationError(
            f"min_cells must be >= 1, got {min_cells}",
            hint="Use min_cells=1 to keep all detected TIL clonotypes.",
        )

    # Normalize til_data to dict format. The frames are only read below,
    # never modified, so defensive copies are unnecessary.
    if isinstance(til_data, ad.AnnData):
        til_dict = {"TIL": til_data.obs}
    elif isinstance(til_data, pd.DataFrame):
        til_dict = {"TIL": til_data}
    elif isinstance(til_data, dict):
        til_dict = dict(til_data)
    else:
        raise TypeError(f"til_data must be AnnData, DataFrame, or dict, got {type(til_data)}")

    def _cdr3_text(frame, column):
        # Missing column -> all-empty strings. The object cast matters for
        # categorical columns (common in .obs read back from h5ad), which
        # reject fillna("") and string concatenation.
        if column not in frame.columns:
            return pd.Series("", index=frame.index)
        return frame[column].astype(object).fillna("")

    sample_stats = {}
    for sample_name, df in til_dict.items():
        beta = _cdr3_text(df, "CDR3_beta")
        if match_by == "CDR3ab":
            clone_key = _cdr3_text(df, "CDR3_alpha") + "_" + beta
        else:
            clone_key = beta

        # "_" (no alpha and no beta) and "" (no beta) are cells without a
        # usable TCR: they count neither as a clone nor toward the total.
        usable = clone_key[(clone_key != "_") & (clone_key != "")]

        sample_stats[sample_name] = {
            "clone_counts": usable.value_counts().to_dict(),
            "total_cells": int(len(usable)),
        }

    total_til_cells = sum(stats["total_cells"] for stats in sample_stats.values())
    all_clones = sorted(
        {clone for stats in sample_stats.values() for clone in stats["clone_counts"]}
    )

    # Fix the column layout up front so that an empty result has the same
    # schema as a populated one.
    columns = ["CDR3ab", "til_samples", "til_cell_count", "til_frequency"]
    if match_by == "CDR3ab":
        columns.append("CDR3_alpha")
    columns.append("CDR3_beta")
    for sample_name in sample_stats:
        columns.append(f"til_cell_count.{sample_name}")
        columns.append(f"til_frequency.{sample_name}")
    columns.append("n_til_samples")
    columns = list(dict.fromkeys(columns))  # guard against accidental duplicates

    rows = []
    for clone in all_clones:
        row = {
            "CDR3ab": clone,
            "til_samples": "",
            "til_cell_count": 0,
            "til_frequency": 0.0,
        }

        if match_by == "CDR3ab":
            # Keys are "<alpha>_<beta>"; split once so both halves are recovered.
            alpha, _, beta_seq = clone.partition("_")
            row["CDR3_alpha"] = alpha
            row["CDR3_beta"] = beta_seq
        else:
            row["CDR3_beta"] = clone

        sample_hits = []
        total_count = 0
        for sample_name, stats in sample_stats.items():
            count = int(stats["clone_counts"].get(clone, 0))
            row[f"til_cell_count.{sample_name}"] = count
            row[f"til_frequency.{sample_name}"] = safe_divide(
                count, stats["total_cells"], default=0.0
            )
            if count > 0:
                sample_hits.append(sample_name)
                total_count += count

        if total_count < min_cells:
            continue

        row["til_samples"] = ",".join(sample_hits)
        row["til_cell_count"] = total_count
        row["til_frequency"] = safe_divide(total_count, total_til_cells, default=0.0)
        row["n_til_samples"] = len(sample_hits)
        rows.append(row)

    result = pd.DataFrame(rows, columns=columns)
    if result.empty:
        return result

    # A stable sort keeps ties in the (sorted) clonotype-key order, making
    # the output reproducible across runs.
    return result.sort_values(
        "til_cell_count", ascending=False, kind="mergesort"
    ).reset_index(drop=True)

match_til ¶

match_til(culture_clonotypes: DataFrame, til_data: AnnData | DataFrame | dict[str, DataFrame], match_by: str = 'CDR3ab', min_til_cells: int = 1) -> pd.DataFrame

Match culture-validated clonotypes against TIL data.

Supports single TIL sample (AnnData or DataFrame) or multiple TIL samples (dict mapping sample names to DataFrames).

Parameters:

Name	Type	Description	Default
`culture_clonotypes`	`DataFrame`	Filtered clonotypes from culture experiments	required
`til_data`	`AnnData, DataFrame, or dict[str, DataFrame]`	TIL data with TCR information. Can be: - AnnData: Single TIL sample (uses .obs) - DataFrame: Single TIL sample with CDR3_alpha, CDR3_beta columns - dict: Multiple TIL samples mapping name -> DataFrame	required
`match_by`	`str`	Matching strategy: "CDR3ab" or "CDR3b_only"	`'CDR3ab'`
`min_til_cells`	`int`	Minimum TIL cells to count as present (per sample for multi-sample)	`1`

Returns:

Type	Description
`DataFrame`	Culture clonotypes with TIL match information: - til_match: Found in any TIL sample - til_samples: Comma-separated list of matching TIL samples - til_cell_count: Total cells across all TIL samples - til_frequency: Combined frequency - til_cell_count.{sample}: Cells in specific sample (multi-sample only) - til_frequency.{sample}: Frequency in specific sample (multi-sample only)

Source code in tcrsift/til.py

def match_til(
    culture_clonotypes: pd.DataFrame,
    til_data: ad.AnnData | pd.DataFrame | dict[str, pd.DataFrame],
    match_by: str = "CDR3ab",
    min_til_cells: int = 1,
) -> pd.DataFrame:
    """
    Match culture-validated clonotypes against TIL data.

    Supports single TIL sample (AnnData or DataFrame) or multiple TIL samples
    (dict mapping sample names to DataFrames).

    Parameters
    ----------
    culture_clonotypes : pd.DataFrame
        Filtered clonotypes from culture experiments. The lookup key is read
        from its ``CDR3ab`` column (``match_by="CDR3ab"``) or its
        ``CDR3_beta`` column (otherwise); rows with an empty key never match.
    til_data : AnnData, DataFrame, or dict[str, DataFrame]
        TIL data with TCR information. Can be:
        - AnnData: Single TIL sample (uses .obs)
        - DataFrame: Single TIL sample with CDR3_alpha, CDR3_beta columns
        - dict: Multiple TIL samples mapping name -> DataFrame
    match_by : str
        Matching strategy: "CDR3ab" or "CDR3b_only". Any value other than
        "CDR3ab" is treated as beta-only matching.
    min_til_cells : int
        Minimum TIL cells to count as present (per sample for multi-sample)

    Returns
    -------
    pd.DataFrame
        Copy of the culture clonotypes (original index preserved) with TIL
        match information:
        - til_match: Found in any TIL sample
        - til_samples: Comma-separated list of matching TIL samples
        - til_cell_count: Total cells across all matching TIL samples
        - til_frequency: Combined frequency
        - til_cell_count.{sample}: Cells in specific sample (multi-sample only)
        - til_frequency.{sample}: Frequency in specific sample (multi-sample only)

    Raises
    ------
    TypeError
        If ``til_data`` is not an AnnData, DataFrame, or dict
    """
    # Normalize til_data to dict format. The TIL frames are only read, so no
    # defensive copies are needed.
    if isinstance(til_data, ad.AnnData):
        til_dict = {"TIL": til_data.obs}
    elif isinstance(til_data, pd.DataFrame):
        til_dict = {"TIL": til_data}
    elif isinstance(til_data, dict):
        til_dict = dict(til_data)
    else:
        raise TypeError(f"til_data must be AnnData, DataFrame, or dict, got {type(til_data)}")

    n_samples = len(til_dict)
    logger.info(
        f"Matching {len(culture_clonotypes)} culture clonotypes against {n_samples} TIL sample(s)"
    )

    df = culture_clonotypes.copy()

    def _cdr3_text(frame, column):
        # Missing column -> all-empty strings. The object cast matters for
        # categorical columns (common in .obs read back from h5ad), which
        # reject fillna("") and string concatenation.
        if column not in frame.columns:
            return pd.Series("", index=frame.index)
        return frame[column].astype(object).fillna("")

    # Per-sample clone counts and totals
    sample_stats = {}
    for sample_name, til_df in til_dict.items():
        beta = _cdr3_text(til_df, "CDR3_beta")
        if match_by == "CDR3ab":
            clone_key = _cdr3_text(til_df, "CDR3_alpha") + "_" + beta
        else:
            clone_key = beta

        # Drop "_" (no alpha and no beta) and "" (no beta) BEFORE counting:
        # otherwise a culture clonotype whose own key is "_" would "match"
        # every TIL cell that has no TCR at all.
        usable = clone_key[(clone_key != "_") & (clone_key != "")]

        sample_stats[sample_name] = {
            "clone_counts": usable.value_counts().to_dict(),
            "total_til": len(usable),
        }

    # Total TIL cells across all samples (for combined frequency)
    total_til_all = sum(s["total_til"] for s in sample_stats.values())

    # Lookup key of every culture clonotype, in row order
    key_column = "CDR3ab" if match_by == "CDR3ab" else "CDR3_beta"
    if key_column in df.columns:
        culture_keys = df[key_column].tolist()
    else:
        culture_keys = [""] * len(df)

    # Results are accumulated positionally and assigned as whole columns.
    # Writing through df.loc[label, ...] row by row would touch every row
    # sharing a duplicated index label and let such rows overwrite each other.
    n_rows = len(df)
    match_flags = [False] * n_rows
    hit_samples = [""] * n_rows
    hit_counts = [0] * n_rows
    hit_freqs = [0.0] * n_rows
    multi_sample = n_samples > 1
    per_sample_counts = {name: [0] * n_rows for name in sample_stats} if multi_sample else {}
    per_sample_freqs = {name: [0.0] * n_rows for name in sample_stats} if multi_sample else {}

    for pos, cdr3ab in enumerate(culture_keys):
        if not cdr3ab:
            continue

        matching_samples = []
        total_count = 0

        for sample_name, stats in sample_stats.items():
            count = stats["clone_counts"].get(cdr3ab)
            # The clone must be present in this sample AND reach the threshold.
            if count is None or count < min_til_cells:
                continue

            matching_samples.append(sample_name)
            total_count += count

            if multi_sample:
                per_sample_counts[sample_name][pos] = count
                per_sample_freqs[sample_name][pos] = safe_divide(
                    count, stats["total_til"], default=0.0
                )

        if matching_samples:
            match_flags[pos] = True
            hit_samples[pos] = ",".join(matching_samples)
            hit_counts[pos] = total_count
            hit_freqs[pos] = safe_divide(total_count, total_til_all, default=0.0)

    def _as_column(values, empty_fill):
        # With zero rows there is nothing to infer a dtype from; assigning
        # the scalar default keeps bool/int/float/str columns typed.
        return values if values else empty_fill

    df["til_match"] = _as_column(match_flags, False)
    df["til_samples"] = _as_column(hit_samples, "")
    df["til_cell_count"] = _as_column(hit_counts, 0)
    df["til_frequency"] = _as_column(hit_freqs, 0.0)

    # Per-sample columns only when there are multiple samples
    if multi_sample:
        for sample_name in til_dict:
            df[f"til_cell_count.{sample_name}"] = _as_column(per_sample_counts[sample_name], 0)
            df[f"til_frequency.{sample_name}"] = _as_column(per_sample_freqs[sample_name], 0.0)

    n_matches = df["til_match"].sum()
    match_pct = safe_percentage(n_matches, len(df), default=0.0)
    logger.info(f"Found {n_matches} culture clonotypes present in TILs ({match_pct:.1f}%)")

    if multi_sample:
        for sample_name in til_dict:
            sample_matches = (df[f"til_cell_count.{sample_name}"] > 0).sum()
            logger.info(f"  - {sample_name}: {sample_matches} matches")

    return df

get_til_summary ¶

get_til_summary(matched_clonotypes: DataFrame) -> dict

Get summary of TIL matching results.

Returns:

Type	Description
`dict`	Summary statistics

Source code in tcrsift/til.py

def get_til_summary(
    matched_clonotypes: pd.DataFrame,
) -> dict:
    """
    Get summary of TIL matching results.

    Parameters
    ----------
    matched_clonotypes : pd.DataFrame
        Clonotype table carrying the ``til_match``, ``til_cell_count`` and
        ``til_frequency`` columns (the ones ``match_til`` adds). Optional
        ``tier`` and ``antigens`` columns enable the per-group breakdowns.

    Returns
    -------
    dict
        Summary statistics:
        - total_culture_clones: number of input rows
        - til_matched_clones: rows with ``til_match`` set
        - til_recovery_rate: matched / total
        - total_til_cells_matched: sum of ``til_cell_count`` over matched rows
        - median_til_frequency: median ``til_frequency`` of matched rows
          (0 when nothing matched)
        - recovery_by_tier: tier -> matched fraction (only with a ``tier``
          column; rows with a missing tier are left out)
        - til_cells_by_antigen: antigens -> summed TIL cells of matched rows
          (only with an ``antigens`` column)

        If the ``til_match`` column is absent, ``{"error": "No TIL match
        information"}`` is returned instead.
    """
    if "til_match" not in matched_clonotypes.columns:
        return {"error": "No TIL match information"}

    matched = matched_clonotypes[matched_clonotypes["til_match"]]

    summary = {
        "total_culture_clones": len(matched_clonotypes),
        "til_matched_clones": len(matched),
        "til_recovery_rate": safe_divide(len(matched), len(matched_clonotypes), default=0.0),
        "total_til_cells_matched": matched["til_cell_count"].sum(),
        "median_til_frequency": matched["til_frequency"].median() if len(matched) > 0 else 0,
    }

    # By tier if available
    if "tier" in matched_clonotypes.columns:
        tier_recovery = {}
        for tier in matched_clonotypes["tier"].unique():
            # Missing tiers usually surface as NaN rather than None; an
            # "is not None" test lets NaN through, and since NaN == NaN is
            # False that would add a meaningless `nan: 0.0` entry.
            if pd.isna(tier):
                continue
            tier_df = matched_clonotypes[matched_clonotypes["tier"] == tier]
            tier_matched = tier_df["til_match"].sum()
            tier_recovery[tier] = safe_divide(tier_matched, len(tier_df), default=0.0)
        summary["recovery_by_tier"] = tier_recovery

    # By antigen if available
    if "antigens" in matched.columns:
        antigen_recovery = matched.groupby("antigens")["til_cell_count"].sum().to_dict()
        summary["til_cells_by_antigen"] = antigen_recovery

    return summary

identify_til_specific_clones ¶

identify_til_specific_clones(til_data: AnnData, culture_clonotypes: DataFrame | None = None, min_cells: int = 2) -> pd.DataFrame

Identify clones that are abundant in TILs but not in culture.

These could be tumor-reactive TCRs not captured in the culture system.

Parameters:

Name	Type	Description	Default
`til_data`	`AnnData`	TIL data	required
`culture_clonotypes`	`DataFrame`	Culture clonotypes to exclude	`None`
`min_cells`	`int`	Minimum cells in TIL to consider	`2`

Returns:

Type	Description
`DataFrame`	TIL-specific clones

Source code in tcrsift/til.py

def identify_til_specific_clones(
    til_data: ad.AnnData,
    culture_clonotypes: pd.DataFrame | None = None,
    min_cells: int = 2,
) -> pd.DataFrame:
    """
    Identify clones that are abundant in TILs but not in culture.

    These could be tumor-reactive TCRs not captured in the culture system.

    Parameters
    ----------
    til_data : ad.AnnData
        TIL data; per-cell TCR information is read from ``.obs``
        (``CDR3_alpha``, ``CDR3_beta`` and, if present, ``sample``)
    culture_clonotypes : pd.DataFrame, optional
        Culture clonotypes to exclude; must provide a ``CDR3ab`` column
        holding "<alpha>_<beta>" keys
    min_cells : int
        Minimum cells in TIL to consider

    Returns
    -------
    pd.DataFrame
        TIL-specific clones, one row each, with columns ``CDR3ab``,
        ``sample`` (first non-null sample label of the clone; omitted when
        ``.obs`` has no ``sample`` column), ``til_cell_count``,
        ``CDR3_alpha`` and ``CDR3_beta``. Cells without any CDR3 sequence
        are not reported as a clone.
    """
    til_df = til_data.obs.copy()

    def _cdr3_text(column):
        # Missing column -> all-empty strings. The object cast matters for
        # categorical columns (common in .obs read back from h5ad), which
        # reject fillna("") and string concatenation.
        if column not in til_df.columns:
            return pd.Series("", index=til_df.index)
        return til_df[column].astype(object).fillna("")

    # Build clone identifier
    til_df["CDR3ab"] = _cdr3_text("CDR3_alpha") + "_" + _cdr3_text("CDR3_beta")

    # Cells with neither chain all collapse onto the key "_". They are not a
    # clonotype, and left in they would show up as one large "expanded clone".
    til_df = til_df[til_df["CDR3ab"] != "_"]

    # Aggregate TIL clones. Size and sample label come from the same groupby,
    # so they are aligned by clone key rather than by position.
    grouped = til_df.groupby("CDR3ab")
    til_clones = grouped.size().rename("til_cell_count").to_frame()
    if "sample" in til_df.columns:
        til_clones.insert(0, "sample", grouped["sample"].first())
    til_clones = til_clones.reset_index()

    # Copy after filtering so the column assignments below act on an
    # independent frame instead of a slice of the aggregate.
    til_clones = til_clones[til_clones["til_cell_count"] >= min_cells].copy()

    # Extract CDR3 sequences (split once: the key is "<alpha>_<beta>")
    chains = til_clones["CDR3ab"].str.split("_", n=1)
    til_clones["CDR3_alpha"] = chains.str[0]
    til_clones["CDR3_beta"] = chains.str[1]

    # Filter out culture clones if provided
    if culture_clonotypes is not None:
        culture_ids = set(culture_clonotypes["CDR3ab"].values)
        til_clones = til_clones[~til_clones["CDR3ab"].isin(culture_ids)]
        logger.info(f"Found {len(til_clones)} TIL-specific clones not in culture")
    else:
        logger.info(f"Found {len(til_clones)} expanded TIL clones")

    return til_clones