Source code for zoom.prepare

# -*- coding: utf-8 -*-
"""
Functionality for perparing neccessary data for ZOOM.
- Regional AHBA processing pipeline optimized for cortical samples.
- Regional SBP and spatial permutation test.
- scRNA-seq preprocess and calculate gene specificity scores.
"""

import numpy as np
import pandas as pd
import nibabel as nib
import abagen
import os
from abagen import io
from abagen.samples_ import _get_struct
from abagen.correct import keep_stable_genes
from neuromaps.datasets import fetch_atlas
from neuromaps.images import load_data
from neuromaps.nulls.spins import (gen_spinsamples, get_parcel_centroids,
                                   load_spins, spin_data, spin_parcels)
from typing import Union, Tuple
from data_loader import (get_vertex_num, load_gii, load_parc)


[docs]
def abagen_ctx(
    atlas: Union[nib.Nifti1Image, Tuple, dict],
    atlas_info: Union[pd.DataFrame, os.PathLike],
    data_dir: os.PathLike,
    donors_threshold: int,
    gene_stability_threshold: float
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    
    """
    AHBA processing pipeline optimized for cortical samples.

    Parameters
    ----------
    atlas : nibabel.Nifti1Image, tuple or dict
        Brain atlas specification. Can be:
        - A parcellation image in MNI space.
        - A tuple of GIFTI images in fsaverage5 space.
        - a dictionary where keys are donor IDs and values are parcellation 
          images (or surfaces) in the native space of each donor.
    atlas_info : pd.DataFrame or os.PathLike
        Filepath to or pre-loaded dataframe containing information about atlas,
        see `abagen` document.
    data_dir : os.PathLike
        Directory where expression data should be downloaded (if it does not 
        already exist) / loaded.
    donors_threshold : int
        Minimum number of donors required for a region to be included.
    gene_stability_threshold : float
        Threshold for filtering genes based on differential stability across donors.

    Returns
    -------
    expression : pd.DataFrame
        Processed regional AHBA expression.
    stability : pd.DataFrame
        Gene-level differential stability.

    References
    ----------
    [1] Markello, R. D., Arnatkeviciute, A., Poline, J. B., Fulcher, B. D., 
        Fornito, A. & Misic, B. Standardizing workflows in imaging transcriptomics 
        with the abagen toolbox. Elife 10, e72129 (2021).
    [2] Dear, R. et al. Cortical gene expression architecture links healthy 
        neurodevelopment to the imaging, transcriptomics and genetics of autism and 
        schizophrenia. Nat. Neurosci. 27, 1075-1086 (2024).
    """
    
    # Patch `drop_mismatch_samples` function to keep only left cortical samples before probe selection
    def drop_mismatch_samples_and_filter(annotation, ontology):
        annotation = io.read_annotation(annotation)
        ontology = io.read_ontology(ontology).set_index('id')
        sid = np.asarray(annotation['structure_id'])
        # get hemisphere and structure path
        hemisphere = np.asarray(ontology.loc[sid, 'hemisphere']
                                        .replace({np.nan: 'B'}))
        structure = np.asarray(ontology.loc[sid, 'structure_id_path']
                                       .apply(_get_struct))
        # add hemisphere + brain "structure" designation to annotation data and
        # only keep samples with consistent hemisphere + MNI coordinate designation
        annot = annotation.assign(hemisphere=hemisphere, structure=structure) \
                          .query('(hemisphere == "L" & mni_x < 0) '
                                 '| (hemisphere == "R" & mni_x > 0) '
                                 '| (hemisphere == "B" & mni_x == 0)',
                                 engine='python')
        annot = annot.copy().query("structure == 'cortex'")
        annot = annot.copy().query("hemisphere == 'L'")
        return annot

    ### Replace original function in abagen package with patched function ###
    abagen.samples_.drop_mismatch_samples = drop_mismatch_samples_and_filter
    
    # Get donor-wised regional AHBA expression with modified abagen
    expression, counts, report = abagen.allen.get_expression_data(
        atlas = atlas, atlas_info = atlas_info, data_dir = data_dir,
        n_proc = 6, return_donors = True, return_counts = True, 
        return_report = True, verbose = 1, lr_mirror = 'rightleft'
    )
    expression = list(expression.values())
    
    # Find regions with at least 1 sample from at least `donors_threshold` donors
    region_filter = (counts > 0).sum(axis=1) >= donors_threshold
    # Filter all donor expression matrices for those regions
    expression = [e.loc[region_filter, :] for e in expression]
    
    # Remember gene labels to use as index
    gene_labels = expression[0].columns
    # Filter all donor expression matrices for stable genes
    expression, stability = keep_stable_genes(
        expression, threshold = gene_stability_threshold,
        return_stability = True
    )
    stability = pd.Series(stability, index=gene_labels)
    
    # Aggregate final expression matrix
    expression = pd.concat(expression).groupby('label').mean()
    stability = pd.DataFrame(stability, columns=["DS"])
    return expression, stability



[docs]
def fetch_medial_wall(
    atlas: str,
    density: str,
    hemi: str
) -> list:
    
    """
    Fetch indices of medial wall vertices.

    Parameters
    ----------
    atlas: {'fsaverage', 'fSLR', 'civet'} optional,
        Name of surface atlas on which `parcellation` is defined.
    density : str, optional
        Density of surface mesh on which `parcellation` is defined. Must becompatible 
        with specified `atlas`.
    hemi: {'L','lh','left','both'} optional,
        Hemisphere used to perform downstream analyses.

    Returns
    -------
    medial_wall : list
        Indices of medial wall vertices.
    """
    
    vertex_num = get_vertex_num(atlas, density, hemi='L')
    # For development stage
    pwd = "/slurm/home/yrd/liaolab/nieshuyang/ZOOM/data/medial_wall"
    
    # Fetch medial wall indices
    if hemi in ['L','lh','left']:        
        filename = f"{pwd}/lh.medial_wall.{atlas}.{density}.txt"
        medial_wall = list(np.loadtxt(filename).astype(np.int16))
    elif hemi=='both':
        lh_filename = f"{pwd}/lh.medial_wall.{atlas}.{density}.txt"
        rh_filename = f"{pwd}/rh.medial_wall.{atlas}.{density}.txt"
        lh_medial_wall = list(np.loadtxt(lh_filename).astype(np.int16))
        rh_medial_wall = list(np.loadtxt(rh_filename).astype(np.int16))
        rh_medial_wall = list(map(lambda x: x+vertex_num,rh_medial_wall))
        medial_wall = lh_medial_wall+rh_medial_wall
    return medial_wall

    

[docs]
def process_SBP(
    SBP: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage],
    parcellation: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage],
    atlas: str, density: str, hemi: str, n_perm: int, seed: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    
    """
    Assign SBP image to parcellation image and perform spatial permutation test.

    Parameters
    ----------
    SBP : nibabel.Nifti1Image, tuple or dict
        Brain atlas specification. Can be:
        - A parcellation image in MNI space.
        - A tuple of GIFTI images in fsaverage5 space.
        - a dictionary where keys are donor IDs and values are parcellation 
          images (or surfaces) in the native space of each donor.
    parcellation : path_like str, tuple, nib.gifti.gifti.GiftiImage
        Recognized parcellation image
        - If hemi in ['L','lh','left']:
            - path_like str: Filepaths to .gii or .annot
            - nib.gifti.gifti.GiftiImage
        - Otherwise, use both hemisphere:
            - tuple: must be organized as L/R pair
                - (str, str): Filepaths to .gii or .annot
                - (nib.gifti.gifti.GiftiImage, nib.gifti.gifti.GiftiImage)
    atlas: {'fsaverage', 'fSLR', 'civet'} optional,
        Name of surface atlas on which `parcellation` is defined.
    density : str, optional
        Density of surface mesh on which `parcellation` is defined. Must becompatible 
        with specified `atlas`.
    hemi: {'L','lh','left','both'} optional,
        Hemisphere used to perform downstream analyses.
    n_perm : int
        Number of permuation test to perform.
    seed : int
        Random seed of spatial permutation test.

    Returns
    -------
    df_SBP_parc : pd.DataFrame
        SBP values in parcellation labels.
    df_surr_parc : pd.DataFrame
        Permutated SBP values in parcellation labels.

    References
    ----------
    [1] Markello, R. D. et al. Neuromaps: structural and functional interpretation 
        of brain maps. Nat. Methods 19, 1472-1479 (2022)..
    [2] Alexander-Bloch, A. F. et al. On testing for spatial correspondence between 
        maps of human brain structure and function. Neuroimage 178, 540-551 (2018).
    """
    
    # Fetch medial wall and vertex number
    medial_wall = fetch_medial_wall(atlas, density, hemi)
    vertex_num = get_vertex_num(atlas, density, hemi)
    
    # Generate surrogates utilizing Alexander-Bloch's spatial permutation test
    surfaces = fetch_atlas(atlas, density)['sphere']
    coords, hemis = get_parcel_centroids(surfaces, method='surface')
    coords = coords[:vertex_num, :]
    coords = np.delete(coords, medial_wall, axis=0)
    hemis = hemis[:vertex_num]
    hemis = np.delete(hemis, medial_wall)
    spins = gen_spinsamples(coords, hemis, n_rotate=n_perm, seed=seed)
    spins = load_spins(spins)
    
    # Load SBP and parcellation
    SBP_data = load_gii(SBP, atlas, density, hemi)
    SBP_data[medial_wall] = np.full((len(medial_wall),), np.nan)
    SBP_data_valid = np.delete(SBP_data, medial_wall)
    parc_data = load_parc(parcellation, atlas, density, hemi)
    parc_labels = list(np.unique(parc_data).astype(np.int16))
    
    # Assign SBP into parcellation image
    SBP_parc = np.full((len(parc_labels),), np.nan)
    for i in range(len(parc_labels)):
        label = parc_labels[i]
        label_idx = np.where(parc_data == label)[0]
        label_values = SBP_data[label_idx]
        SBP_parc[i] = np.mean(label_values)
    df_SBP_parc = pd.DataFrame(SBP_parc, index=parc_labels, 
                               columns=['SBP'])
    
    # Project permutated SBP onto standard surface
    surrogates = load_data(SBP_data_valid)[spins]
    surr_surf = np.full((vertex_num, n_perm), np.nan)
    for i in range(n_perm):
        surr_surf[:,i][~np.isnan(SBP_data)] = surrogates[:,i]
        
    # Assign permutated SBP into parcellation image
    surr_parc = np.full((len(parc_labels), n_perm), np.nan)
    for p in range(n_perm):
        for i in range(len(parc_labels)):
            label = parc_labels[i]
            label_idx = np.where(parc_data == label)[0]
            label_values = surr_surf[label_idx, p]
            surr_parc[i, p] = np.mean(label_values)
    df_surr_parc = pd.DataFrame(surr_parc, index=parc_labels, 
                                columns=[f'Perm{i+1}' for i in range(n_perm)])
    return df_SBP_parc, df_surr_parc