# -*- coding: utf-8 -*-
"""
Functionality for perparing neccessary data for ZOOM.
- Regional AHBA processing pipeline optimized for cortical samples.
- Regional SBP and spatial permutation test.
- scRNA-seq preprocess and calculate gene specificity scores.
"""
import numpy as np
import pandas as pd
import nibabel as nib
import abagen
import os
from abagen import io
from abagen.samples_ import _get_struct
from abagen.correct import keep_stable_genes
from neuromaps.datasets import fetch_atlas
from neuromaps.images import load_data
from neuromaps.nulls.spins import (gen_spinsamples, get_parcel_centroids,
load_spins, spin_data, spin_parcels)
from typing import Union, Tuple
from data_loader import (get_vertex_num, load_gii, load_parc)
[docs]
def abagen_ctx(
atlas: Union[nib.Nifti1Image, Tuple, dict],
atlas_info: Union[pd.DataFrame, os.PathLike],
data_dir: os.PathLike,
donors_threshold: int,
gene_stability_threshold: float
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
AHBA processing pipeline optimized for cortical samples.
Parameters
----------
atlas : nibabel.Nifti1Image, tuple or dict
Brain atlas specification. Can be:
- A parcellation image in MNI space.
- A tuple of GIFTI images in fsaverage5 space.
- a dictionary where keys are donor IDs and values are parcellation
images (or surfaces) in the native space of each donor.
atlas_info : pd.DataFrame or os.PathLike
Filepath to or pre-loaded dataframe containing information about atlas,
see `abagen` document.
data_dir : os.PathLike
Directory where expression data should be downloaded (if it does not
already exist) / loaded.
donors_threshold : int
Minimum number of donors required for a region to be included.
gene_stability_threshold : float
Threshold for filtering genes based on differential stability across donors.
Returns
-------
expression : pd.DataFrame
Processed regional AHBA expression.
stability : pd.DataFrame
Gene-level differential stability.
References
----------
[1] Markello, R. D., Arnatkeviciute, A., Poline, J. B., Fulcher, B. D.,
Fornito, A. & Misic, B. Standardizing workflows in imaging transcriptomics
with the abagen toolbox. Elife 10, e72129 (2021).
[2] Dear, R. et al. Cortical gene expression architecture links healthy
neurodevelopment to the imaging, transcriptomics and genetics of autism and
schizophrenia. Nat. Neurosci. 27, 1075-1086 (2024).
"""
# Patch `drop_mismatch_samples` function to keep only left cortical samples before probe selection
def drop_mismatch_samples_and_filter(annotation, ontology):
annotation = io.read_annotation(annotation)
ontology = io.read_ontology(ontology).set_index('id')
sid = np.asarray(annotation['structure_id'])
# get hemisphere and structure path
hemisphere = np.asarray(ontology.loc[sid, 'hemisphere']
.replace({np.nan: 'B'}))
structure = np.asarray(ontology.loc[sid, 'structure_id_path']
.apply(_get_struct))
# add hemisphere + brain "structure" designation to annotation data and
# only keep samples with consistent hemisphere + MNI coordinate designation
annot = annotation.assign(hemisphere=hemisphere, structure=structure) \
.query('(hemisphere == "L" & mni_x < 0) '
'| (hemisphere == "R" & mni_x > 0) '
'| (hemisphere == "B" & mni_x == 0)',
engine='python')
annot = annot.copy().query("structure == 'cortex'")
annot = annot.copy().query("hemisphere == 'L'")
return annot
### Replace original function in abagen package with patched function ###
abagen.samples_.drop_mismatch_samples = drop_mismatch_samples_and_filter
# Get donor-wised regional AHBA expression with modified abagen
expression, counts, report = abagen.allen.get_expression_data(
atlas = atlas, atlas_info = atlas_info, data_dir = data_dir,
n_proc = 6, return_donors = True, return_counts = True,
return_report = True, verbose = 1, lr_mirror = 'rightleft'
)
expression = list(expression.values())
# Find regions with at least 1 sample from at least `donors_threshold` donors
region_filter = (counts > 0).sum(axis=1) >= donors_threshold
# Filter all donor expression matrices for those regions
expression = [e.loc[region_filter, :] for e in expression]
# Remember gene labels to use as index
gene_labels = expression[0].columns
# Filter all donor expression matrices for stable genes
expression, stability = keep_stable_genes(
expression, threshold = gene_stability_threshold,
return_stability = True
)
stability = pd.Series(stability, index=gene_labels)
# Aggregate final expression matrix
expression = pd.concat(expression).groupby('label').mean()
stability = pd.DataFrame(stability, columns=["DS"])
return expression, stability
[docs]
def process_SBP(
SBP: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage],
parcellation: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage],
atlas: str, density: str, hemi: str, n_perm: int, seed: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Assign SBP image to parcellation image and perform spatial permutation test.
Parameters
----------
SBP : nibabel.Nifti1Image, tuple or dict
Brain atlas specification. Can be:
- A parcellation image in MNI space.
- A tuple of GIFTI images in fsaverage5 space.
- a dictionary where keys are donor IDs and values are parcellation
images (or surfaces) in the native space of each donor.
parcellation : path_like str, tuple, nib.gifti.gifti.GiftiImage
Recognized parcellation image
- If hemi in ['L','lh','left']:
- path_like str: Filepaths to .gii or .annot
- nib.gifti.gifti.GiftiImage
- Otherwise, use both hemisphere:
- tuple: must be organized as L/R pair
- (str, str): Filepaths to .gii or .annot
- (nib.gifti.gifti.GiftiImage, nib.gifti.gifti.GiftiImage)
atlas: {'fsaverage', 'fSLR', 'civet'} optional,
Name of surface atlas on which `parcellation` is defined.
density : str, optional
Density of surface mesh on which `parcellation` is defined. Must becompatible
with specified `atlas`.
hemi: {'L','lh','left','both'} optional,
Hemisphere used to perform downstream analyses.
n_perm : int
Number of permuation test to perform.
seed : int
Random seed of spatial permutation test.
Returns
-------
df_SBP_parc : pd.DataFrame
SBP values in parcellation labels.
df_surr_parc : pd.DataFrame
Permutated SBP values in parcellation labels.
References
----------
[1] Markello, R. D. et al. Neuromaps: structural and functional interpretation
of brain maps. Nat. Methods 19, 1472-1479 (2022)..
[2] Alexander-Bloch, A. F. et al. On testing for spatial correspondence between
maps of human brain structure and function. Neuroimage 178, 540-551 (2018).
"""
# Fetch medial wall and vertex number
medial_wall = fetch_medial_wall(atlas, density, hemi)
vertex_num = get_vertex_num(atlas, density, hemi)
# Generate surrogates utilizing Alexander-Bloch's spatial permutation test
surfaces = fetch_atlas(atlas, density)['sphere']
coords, hemis = get_parcel_centroids(surfaces, method='surface')
coords = coords[:vertex_num, :]
coords = np.delete(coords, medial_wall, axis=0)
hemis = hemis[:vertex_num]
hemis = np.delete(hemis, medial_wall)
spins = gen_spinsamples(coords, hemis, n_rotate=n_perm, seed=seed)
spins = load_spins(spins)
# Load SBP and parcellation
SBP_data = load_gii(SBP, atlas, density, hemi)
SBP_data[medial_wall] = np.full((len(medial_wall),), np.nan)
SBP_data_valid = np.delete(SBP_data, medial_wall)
parc_data = load_parc(parcellation, atlas, density, hemi)
parc_labels = list(np.unique(parc_data).astype(np.int16))
# Assign SBP into parcellation image
SBP_parc = np.full((len(parc_labels),), np.nan)
for i in range(len(parc_labels)):
label = parc_labels[i]
label_idx = np.where(parc_data == label)[0]
label_values = SBP_data[label_idx]
SBP_parc[i] = np.mean(label_values)
df_SBP_parc = pd.DataFrame(SBP_parc, index=parc_labels,
columns=['SBP'])
# Project permutated SBP onto standard surface
surrogates = load_data(SBP_data_valid)[spins]
surr_surf = np.full((vertex_num, n_perm), np.nan)
for i in range(n_perm):
surr_surf[:,i][~np.isnan(SBP_data)] = surrogates[:,i]
# Assign permutated SBP into parcellation image
surr_parc = np.full((len(parc_labels), n_perm), np.nan)
for p in range(n_perm):
for i in range(len(parc_labels)):
label = parc_labels[i]
label_idx = np.where(parc_data == label)[0]
label_values = surr_surf[label_idx, p]
surr_parc[i, p] = np.mean(label_values)
df_surr_parc = pd.DataFrame(surr_parc, index=parc_labels,
columns=[f'Perm{i+1}' for i in range(n_perm)])
return df_SBP_parc, df_surr_parc