Source code for zoom.data_loader

# -*- coding: utf-8 -*-
"""
Functionality for loading single-cell RNA sequencing data, surface-
based spatial brain phenotypes, parcellation and data frame.
"""

import scanpy as sc
import anndata as ad
import scipy.sparse as sp
import nibabel as nib
import numpy as np
import pandas as pd
import os
from typing import Union

[docs] def load_sc( adata: Union[os.PathLike,ad.AnnData], flag_sparse: bool ) -> ad.AnnData: """ Load and check scRNA-seq file (.h5ad). Parameters ---------- adata : str or AnnData Path to .h5ad file or AnnData object flag_sparse : bool If True, enforce adata.X to be a sparse matrix Returns ------- adata : ad.AnnData AnnData object containing scRNA-seq data. """ if isinstance(adata, str): adata = sc.read_h5ad(adata) elif isinstance(adata, ad.AnnData): pass else: raise TypeError( f"AnnData must be a filepath (str) or ad.AnnData. Got {type(df)}." ) # Check input: # 1. If there is any NaN value; # 2. If there is any negative value; if np.isnan(adata.X.sum()): raise ValueError( "Single-cell gene expression matrix should not contain any NaN value. Please check your data." ) if (adata.X < 0).sum() > 0: raise ValueError( "Single-cell gene expression matrix should not contain any negative value. Please check your data." ) # Convert gene expression matrix into sparse matrix if flag_sparse: if not sp.issparse(adata.X): adata.X = sp.csr_matrix(adata.X) return adata
# A quick function to grep vertex number for each atlas def get_vertex_num(atlas, density, hemi): atlas_vertex_dict = {'fsLR_32k': 32492, 'fsLR_164k': 163842, 'fsaverage_10k': 10242, 'fsaverage_41k': 40962, 'fsaverage_164k': 163842, 'civet_41k': 40962} if hemi in ['L','lh','left']: return atlas_vertex_dict[f'{atlas}_{density}'] elif hemi == 'both': return atlas_vertex_dict[f'{atlas}_{density}']*2
[docs] def load_gii( gii: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage], atlas: str, density: str, hemi: str ) -> np.ndarray: """ Load GIFTI image. Parameters ---------- gii : path_like str, tuple or nib.gifti.gifti.GiftiImage Recognized GIFTI image - If hemi in ['L','lh','left']: - path_like str: Filepaths to .gii - nib.gifti.gifti.GiftiImage - Otherwise, use both hemisphere: - tuple: must be organized as L/R pair - (str, str): Filepaths to .gii - (GiftiImage, GiftiImage) atlas : {'fsaverage', 'fSLR'} optional, Name of surface atlas on which `gii` is defined. density : str, optional Density of surface mesh on which `gii` is defined. Must becompatible with specified `atlas`. hemi : {'L','lh','left','both'} optional, Hemisphere used to perform downstream analyses. Returns ------- gii_data : np.ndarray Data extracted from given GIFTI image. """ # Fetch expected vertex number for given space vertex_num = get_vertex_num(atlas, density, hemi) # Load GIFTI image for left hemisphere analysis if hemi in ['L','lh','left']: if isinstance(gii, str): gii_img = nib.load(gii) elif isinstance(gii, nib.gifti.gifti.GiftiImage): gii_img = gii else: raise TypeError( f"For left hemisphere data, GIFTI image must be a filepath (str) or GiftiImage. Got {type(gii)}." ) gii_data = gii_img.darrays[0].data # Load GIFTI image for both hemisphere analysis elif hemi == 'both': gii_data_lr = [] if isinstance(gii, tuple): if not gii or len(gii) != 2: # Assume tuples are for L/R pair raise ValueError("Tuple must contain exactly two elements (for L and R).") # Determine type of elements in tuple str_tuple = all(isinstance(p, str) and p.endswith('.gii') for p in gii) gii_tuple = all(isinstance(p, nib.gifti.gifti.GiftiImage) for p in gii) if str_tuple: lh_gii_img = nib.load(gii[0]) rh_gii_img = nib.load(gii[1]) gii_data_lr.append(lh_gii_img.darrays[0].data) gii_data_lr.append(rh_gii_img.darrays[1].data) elif gii_tuple: gii_data_lr.append(gii[0].darrays[0].data) gii_data_lr.append(gii[1].darrays[1].data) else: raise TypeError( f"For both hemisphere data, GIFTI image must be a tuple containg filepaths or GIFTI images. Got {type(gii[0])} and {type(gii[1])}." ) else: raise TypeError( f"For both hemisphere data, GIFTI image must be a tuple containg filepaths or GIFTI images. Got {type(gii)}." ) gii_data = np.concatenate(gii_data_lr) if gii_data.shape[0] != vertex_num: raise ValueError(f"For the {atlas} {density} ({hemi} hemisphere) standard space, the expected number of vertices is {vertex_num}, whereas the actual number obtained is {gii_data.shape[0]}.") return gii_data
[docs] def load_parc( parcellation: Union[os.PathLike, tuple, nib.gifti.gifti.GiftiImage], atlas: str, density: str, hemi: str ) -> np.ndarray: """ Load parcellation image. Parameters ---------- parcellation : path_like str, tuple, nib.gifti.gifti.GiftiImage Recognized parcellation image - If hemi in ['L','lh','left']: - path_like str: Filepaths to .gii or .annot - nib.gifti.gifti.GiftiImage - Otherwise, use both hemisphere: - tuple: must be organized as L/R pair - (str, str): Filepaths to .gii or .annot - (nib.gifti.gifti.GiftiImage, nib.gifti.gifti.GiftiImage) atlas: {'fsaverage', 'fSLR', 'civet'} optional, Name of surface atlas on which `parcellation` is defined. density : str, optional Density of surface mesh on which `parcellation` is defined. Must becompatible with specified `atlas`. hemi: {'L','lh','left','both'} optional, Hemisphere used to perform downstream analyses. Returns ------- gii_data : np.ndarray Data extracted from given GIFTI image. """ # Fetch expected vertex number for given space vertex_num = get_vertex_num(atlas, density, hemi) # Load parcellation image for left hemisphere analysis if hemi in ['L','lh','left']: if isinstance(parcellation, str): if parcellation.endswith('.gii'): parc_data = load_gii(parcellation,atlas,density,hemi) elif parcellation.endswith('.annot'): parc_data, _, _ = nib.freesurfer.io.read_annot(parcellation) else: raise ValueError( f"Filepath must end with .gii or .annot. Got: {parcellation}" ) elif isinstance(parcellation, nib.gifti.gifti.GiftiImage): parc_data = load_gii(parcellation,atlas,density,hemi) else: raise TypeError( f"For left hemisphere, parcellation image must be a filepath (str) or GiftiImage. Got {type(parcellation)}." ) # Load parcellation image for both hemisphere analysis elif hemi == 'both': if isinstance(parcellation, tuple): if not parcellation or len(parcellation) != 2: # Assume tuples are for L/R pair raise ValueError("Tuple must contain exactly two elements (for L and R).") # Determine type of elements in tuple str_gii_tuple = all(isinstance(p, str) and p.endswith('.gii') for p in parcellation) str_annot_tuple = all(isinstance(p, str) and p.endswith('.annot') for p in parcellation) gii_tuple = all(isinstance(p, nib.gifti.gifti.GiftiImage) for p in parcellation) if str_gii_tuple or gii_tuple: parc_data = load_gii(parcellation,atlas,density,hemi) elif str_annot_tuple: lh_parc_data, _, _ = nib.freesurfer.io.read_annot(parcellation[0]) rh_parc_data, _, _ = nib.freesurfer.io.read_annot(parcellation[1]) parc_data = np.concatenate([lh_parc_data,rh_parc_data]) else: raise TypeError( f"For both hemisphere data, parcellation image must be a tuple containg filepaths ending with '.gii' or '.annot', or GIFTI images. Got {type(gii[0])} and {type(gii[1])}." ) else: raise TypeError( f"For both hemisphere, parcellation image must be a tuple containg filepaths or GIFTI images. Got {type(parcellation)}." ) if parc_data.shape[0] != vertex_num: raise ValueError(f"For the {atlas} {density} ({hemi} hemisphere) standard space, the expected number of vertices is {vertex_num}, whereas the actual number obtained is {parc_data.shape[0]}.") return parc_data
[docs] def load_df(df: Union[os.PathLike, pd.DataFrame]) -> pd.DataFrame: """ Load parcellation image. Parameters ---------- file : path_like str, pd.DataFrame Recognized data frame file - path_like str: Filepath to .csv, .tsv - pd.DataFrame Returns ------- df : pd.DataFrame Data frame for analysis. """ if isinstance(df, str): if df.endswith('.csv'): df = pd.read_csv(df, index_col=0) elif df.endswith('.tsv'): df = pd.read_csv(df, sep='\t', index_col=0) else: raise ValueError( f"Filepath must end with .csv or .tsv. Got: {df}" ) elif isinstance(df, pd.DataFrame): pass else: raise TypeError( f"DataFrame must be a filepath (str) or pd.DataFrame. Got {type(df)}." ) return df