String Functions Documentation

This page describes the functions in scanpy_wrappers.py and their usage.
Functions here default to showing their source code for greater transparency and interoperability with base scanpy functions.

ScanpyMetaObject

Object that collects standardized collections of scanpy functions.

You can access the specific scanpy data in this object using ScanpyMetaObject.adata.
Use this variable like you would use adata in any scanpy operation.

Parameters:

Name Type Description Default
matrix GxcFile | ExcFile

BioFile object of the matrix file.

required
sampledict SampleDict

a SampleDict object from the BioFileDocket.

required
Source code in utils/scanpy_wrappers.py
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
class ScanpyMetaObject():
    """Object that collects standardized collections of scanpy functions.

    You can access the specific scanpy data in this object using `ScanpyMetaObject.adata`.
    Use this variable like you would use `adata` in any scanpy operation.
    `adata` is only set once `read()` has been called.

    Args:
        matrix (GxcFile | ExcFile): BioFile object of the matrix file.
        sampledict (SampleDict): a SampleDict object from the BioFileDocket.
    """
    def __init__(self, matrix, sampledict: "SampleDict"):

        self.matrix = matrix
        self.sampledict = sampledict
        # Cache the fields of the matrix / sampledict that the properties below rely on.
        self.matrixname = self.matrix.filename
        self.species = self.sampledict.species
        self.directory = self.sampledict.directory
        self.conditions = self.sampledict.conditions

    @property
    def species_prefix(self):
        """Species name as-is when it has no underscore, otherwise `prefixify(species)`."""
        if '_' not in self.species:
            return self.species
        return prefixify(self.species)

    @property
    def matrixpath(self):
        """`directory + matrixname`; plain concatenation, no separator is inserted."""
        return self.directory + self.matrixname

    @property
    def datatype(self):
        """Class name of the matrix object with everything from 'File' onwards removed.

        For example a `GxcFile` matrix gives `'Gxc'`.
        """
        # Use the class name directly instead of slicing str(type(...)) on '.',
        # which silently returned a module name for classes in nested packages.
        return type(self.matrix).__name__.split('File')[0]

    @property
    def resultsfile(self):
        """File name used for saved results: `matrixname + '_scresults.h5ad'`."""
        return self.matrixname + '_scresults.h5ad'

    @property
    def resultspath(self):
        """`directory + resultsfile`; plain concatenation, no separator is inserted."""
        return self.directory + self.resultsfile

    def read(self, delimiter = '\t', cache = True, transpose = True, filter_set = set()):
        r"""Reads the contained matrix into a scanpy.adata object, transposing if needed.

        Args:
            delimiter (str): a delimiter for the data, such as ',' or '\t'. Defaults to '\t'.
            cache (bool): whether or not to create a cache file for the matrix. Defaults to True.
            transpose (bool): whether or not to transpose the data matrix after loading. Defaults to True.
            filter_set (set): a set of ids to keep; rows of the loaded matrix whose
                index is not in the set are removed. An empty set disables filtering.
                Filtering happens before the optional transpose.
        """
        self.adata = sc.read(self.matrixpath, cache = cache, delimiter = delimiter)

        # filter_set is only read, never mutated, so the shared default is safe here.
        if len(filter_set) != 0:
            self.adata = self.adata[self.adata.obs.index.isin(filter_set)].copy()

        if transpose:
            self.adata = self.adata.transpose()

    def violin(self, x = 'total_counts', y = 'n_genes_by_counts', plot = True):
        """Runs sc.pp.calculate_qc_metrics, sc.pl.violin, and sc.pl.scatter.

        The QC metrics are always computed (in place); only the plots are optional.

        Args:
            x (str): label of the feature to be plotted on the x axis of the scatter plot.
                Defaults to 'total_counts'.
            y (str): label of the feature to be plotted on the y axis of the scatter plot.
                Defaults to 'n_genes_by_counts'.
            plot (bool): whether to draw the violin and scatter plots. Defaults to True.
        """
        sc.pp.calculate_qc_metrics(self.adata, percent_top=None, log1p=False, inplace=True)

        if plot:
            sc.pl.violin(self.adata, [y, x],
             jitter=0.4, multi_panel=True)
            sc.pl.scatter(self.adata, x=x, y=y)

    def cellgene_filter(self, min_genes=100, min_cells=20):
        """Filters data by minimum number of genes expressed per cell and minimum expressing cells per gene.

        Args:
            min_genes (int): minimum number of genes expressed per cell as a cutoff.
            min_cells (int): minimum number of cells expressed per gene as a cutoff.
        """
        sc.pp.filter_cells(self.adata, min_genes=min_genes)
        sc.pp.filter_genes(self.adata, min_cells=min_cells)


    def normalize(self, max_n_genes_by_counts = 7000, target_sum = 1e4):
        """Normalizes data using a max_n_genes_by_counts cutoff and a target sum.

        Modifies underlying `adata` using cutoff and normalization.

        Args:
            max_n_genes_by_counts (int): only cells whose `n_genes_by_counts` is
                strictly below this value are kept.
            target_sum (float): target number of counts per cell, passed to
                `sc.pp.normalize_total`.
        """
        # NOTE(review): relies on obs.n_genes_by_counts already existing, presumably
        # from the calculate_qc_metrics call in violin() -- confirm call order.
        keep = self.adata.obs.n_genes_by_counts < max_n_genes_by_counts
        # Copy so the in-place steps below act on real data rather than on a view.
        self.adata = self.adata[keep, :].copy()
        sc.pp.normalize_total(self.adata, target_sum = target_sum)
        sc.pp.log1p(self.adata)

    def variable_filter(self, min_mean = 0.0125, max_mean = 3, min_disp = 0.1, max_disp = 10, plot = True):
        """Filters genes to use for dimensionality reduction using max/min of mean and max/min of dispersion, plotting optionally.

        Modifies underlying `adata` and saves original data under `adata.raw`.

        Args:
            min_mean (float | int): minimum mean expression of genes.
            max_mean (float | int): maximum mean expression of genes.
            min_disp (float | int): minimum dispersion of gene expression.
            max_disp (float | int): maximum dispersion of gene expression.
            plot (bool): whether or not to generate a scatter plot showing cutoffs.
        """

        sc.pp.highly_variable_genes(self.adata, min_mean = min_mean, max_mean=max_mean, min_disp=min_disp, max_disp=max_disp)
        if plot:
            sc.pl.highly_variable_genes(self.adata)

        # Keep the unfiltered data reachable via .raw before subsetting the genes.
        self.adata.raw = self.adata
        self.adata = self.adata[:, self.adata.var.highly_variable]

    def regress_scale(self, how = ['total_counts'], max_value = 10):
        """Runs regression of specific features and scales data to a maximum value.

        Args:
            how (list): features to regress out. Defaults to ['total_counts'].
            max_value (float | int): maximum value to scale data.
        """

        sc.pp.regress_out(self.adata, how)
        sc.pp.scale(self.adata, max_value = max_value)

    def map_cellannots(self, cellannot):
        """Adds an additional .obs feature for cell annotation.

        Imports from a tab-separated file that has two columns: `cell_barcode` and `celltype`.
        The `cell_barcode` field should be an exact match for cells in the data.
        Cells without a matching barcode are given the `celltype` label `'Unlabeled'`.
        If a barcode is listed more than once, its first row is used.

        Args:
            cellannot (CellAnnotFile): CellAnnotFile object, usually in the same BioFileDocket.
        """

        barcodes = pd.DataFrame({'cell_barcode': self.adata.obs.index})
        # De-duplicate first: repeated barcodes would make the left merge emit
        # extra rows and the assignment into .obs fail with a length mismatch.
        annots = pd.read_csv(cellannot.path, sep = '\t').drop_duplicates(subset = 'cell_barcode')
        merged = barcodes.merge(annots[['cell_barcode', 'celltype']], on = 'cell_barcode', how = 'left')
        self.adata.obs['celltype'] = merged['celltype'].fillna('Unlabeled').values

    def map_cellannots_multispecies(self, msd):
        """Adds an additional .obs feature for cell annotation from a MultiSpeciesBioFileDocket.

        Imports, per species, from a tab-separated file that has two columns:
        `cell_barcode` and `celltype`. Both columns are prefixed with
        `<species key>_` before matching, so barcodes in the data are expected
        to carry the same prefix.
        Cells without a matching barcode are given the `celltype` label `'Unlabeled'`.
        If a prefixed barcode is listed more than once, its first row is used.

        Args:
            msd (MultiSpeciesBioFileDocket): the docket containing information about files from all species in the dataset.
                The function looks for the BioFile object at the key `cellannot` for each species.
        """

        barcodes = pd.DataFrame({'cell_barcode': self.adata.obs.index})

        dockets = msd.species_BioFileDockets
        frames = []
        for pre in dockets:
            annot = pd.read_csv(dockets[pre].cellannot.path, sep = '\t')
            annot['cell_barcode'] = pre + '_' + annot['cell_barcode']
            annot['celltype'] = pre + '_' + annot['celltype']
            frames.append(annot)

        # Concatenate once; with no species an empty frame is used, so the column
        # lookup below raises KeyError exactly as the merge did before.
        annots = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()
        # Repeated barcodes would otherwise add rows in the left merge.
        annots = annots.drop_duplicates(subset = 'cell_barcode')

        merged = barcodes.merge(annots[['cell_barcode', 'celltype']], on = 'cell_barcode', how = 'left')
        self.adata.obs['celltype'] = merged['celltype'].fillna('Unlabeled').values

    def pca_basic(self, svd_solver='arpack', color = [], plot = True):
        """Runs a simple PCA and optionally plots the first two components.

        The PCA plot is saved under `species_prefix + datatype + '_pca.pdf'`.
        The variance-ratio plot is drawn only when `color` is empty.

        Args:
            svd_solver (str): SVD solving function, passed to `sc.tl.pca()`.
            color (list): list of coloring schemes for points in the data, passed to `sc.pl.pca`.
            plot (bool): whether or not to make a plot.
        """
        sc.tl.pca(self.adata, svd_solver=svd_solver)

        if not plot:
            return None

        if color != []:
            sc.pl.pca(self.adata, color = color, save = self.species_prefix + self.datatype + '_pca.pdf')
        else:
            sc.pl.pca(self.adata, save = self.species_prefix + self.datatype + '_pca.pdf')
            # NOTE(review): only reached when no color is given -- confirm this is intended.
            sc.pl.pca_variance_ratio(self.adata, log=True)

    def umap_leiden(self, n_neighbors=50, n_pcs = 40, legend_loc='on data', save = True, plot = True):
        """Builds the neighbor graph, then runs UMAP followed by Leiden clustering.

        Args:
            n_neighbors (int): number of neighbors to pass to `sc.pp.neighbors`.
            n_pcs (int): number of Principal Components to use from PCA, passed to `sc.pp.neighbors`.
            legend_loc (str): where the legend should go (e.g. `'on data'`).
            save (bool): whether to save the plot.
            plot (bool): whether to make the plot.
        """
        sc.pp.neighbors(self.adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
        sc.tl.umap(self.adata)
        sc.tl.leiden(self.adata)

        if not plot:
            return None

        if save:
            sc.pl.umap(self.adata, color=['leiden'], legend_loc=legend_loc, save = '_'.join([self.species_prefix, self.datatype, 'leiden.pdf']))
        else:
            sc.pl.umap(self.adata, color=['leiden'], legend_loc=legend_loc)

    def rank_genes(self, on = 'leiden', method = 't-test', plot = True, n_genes = 25, sharey = False):
        """Runs `sc.tl.rank_genes_groups` based on passed parameters.

        Parameters can be changed to alter the ranking scheme.
        The ranked `adata` is always written to `self.resultsfile`.

        Args:
            on (str): clustering feature to use. Defaults to 'leiden'.
            method (str): how to compare groups for gene ranking. Defaults to 't-test'.
            plot (bool): whether or not to plot the data.
            n_genes (int): number of genes to plot.
            sharey (bool): pass to `sc.pl.rank_genes_groups`.
        """

        sc.tl.rank_genes_groups(self.adata, on, method=method)
        # Honor the documented `plot` switch; it used to be ignored.
        if plot:
            sc.pl.rank_genes_groups(self.adata, n_genes=n_genes, sharey=sharey)
        # NOTE(review): this writes to the bare file name (relative to the working
        # directory), not to self.resultspath -- confirm which is intended.
        self.adata.write(self.resultsfile)

    def get_top_genes(self, datatype: str, top_number = 200, tofile = True):
        """Get the top_number genes per cluster across all clusters without repetition.

        Also get the top marker gene per cluster.
        Both lists are stored on this object under the returned attribute names,
        and the full ranking table is stored as `self.marker_genes_df`.
        Genes appear in first-seen order (rank by rank across clusters).

        Args:
            datatype (str): descriptor for the data type, to be included in the attribute names.
                Note: This is not checked for correctness!
            top_number (int): number of top genes to pull out from each cluster.
            tofile (bool): whether to save a file. (Not implemented yet; currently unused.)

        Returns:
            (tuple[str, str]): attribute names of the top gene list and the top marker gene list.
        """
        self.marker_genes_df = pd.DataFrame(self.adata.uns['rank_genes_groups']['names'])

        # dict.fromkeys de-duplicates while keeping order, so repeated runs give
        # identical lists (a set made the order arbitrary).
        top_rows = self.marker_genes_df.iloc[0:top_number].values.tolist()
        top_genelist = list(dict.fromkeys(chain.from_iterable(top_rows)))
        top_marker_genes = list(dict.fromkeys(self.marker_genes_df.iloc[0].values))

        genelisttype = '_'.join(['top', str(top_number) + 'DE', datatype, 'list'])
        markergenetype = '_'.join(['top', 'marker', datatype, 'list'])
        setattr(self, genelisttype, top_genelist)
        setattr(self, markergenetype, top_marker_genes)

        return genelisttype, markergenetype

    def export_top_genes(self, key: str):
        """Builds a GeneListFile for a stored gene list and attaches it to this object.

        The key is expected to look like `top_<type>_<datatype>_list`, as
        returned by `get_top_genes`. The resulting object is stored under the
        attribute `key + '_file'`.

        Args:
            key (str): a key for the gene list, ending in `'_list'`.

        Raises:
            ValueError: when the key does not end in `'_list'` or has too few
                underscore-separated parts. (ValueError is a subclass of Exception.)
        """
        if not key.endswith('_list'):
            raise ValueError('key must be an attribute ending in "_list"')

        parts = key.split('_')
        if len(parts) < 4:
            raise ValueError('key must look like "top_<type>_<datatype>_list"')

        top_type = parts[1]
        # Everything between the type and the trailing 'list' is the datatype,
        # so datatypes that themselves contain underscores survive intact.
        datatype = '_'.join(parts[2:-1])

        # Imported here so the name is in scope for this method.
        # NOTE(review): presumably kept local to avoid a circular import -- confirm.
        from biofile_handling import GeneListFile

        top_genelist_filename = '_'.join([self.species_prefix, self.datatype, 'top', top_type, datatype, 'ids.txt'])
        top_genelist_file = GeneListFile(filename = top_genelist_filename,
                                         sampledict = self.sampledict,
                                         sources = self.matrix,
                                         genes = getattr(self, key),
                                         identifier = datatype)
        setattr(self, key + '_file', top_genelist_file)

    # Using an IdmmFile object, convert IDs from one type to another
    # Usually you'll be converting from gene_name to the embedding (e.g. Orthogroup)
    # Prints a table of the mappings
    # Returns a list of ids for the ones actually present in the scanpy object
    def map_gene_to_id(self, idmm, gene_list: list, from_id: str, to_id: str, check_ids = True):
        """Maps IDs from one type into another using an IdmmFile object.

        Reads the tab-separated file at `idmm.path`, shows the de-duplicated
        `from_id` -> `to_id` table for the requested genes, and returns the
        mapped ids.

        Args:
            idmm (IdmmFile): the IdmmFile object you're converting IDs between.
            gene_list (list of str): list of genes in the dataset you want to convert.
            from_id (str): starting feature column in the idmm.
            to_id (str): ending feature in the idmm.
            check_ids (bool): whether to check if the mapped ids are present in the parent object.
                If true, only returns mappings that are found in `self.adata.var.index`.

        Returns:
            (list): mapped ids, in table order.
        """

        import pandas as pd

        # `display` is not defined in this file; it is presumably the helper that
        # IPython/Jupyter injects. Fall back to print so this also runs elsewhere.
        try:
            show = display
        except NameError:
            show = print

        mapping = pd.read_csv(idmm.path, sep = '\t')

        id_table = mapping[mapping[from_id].isin(gene_list)][[from_id, to_id]].drop_duplicates()
        show(id_table)

        if check_ids:
            # Build the lookup once instead of re-listing the index per element.
            known_ids = set(self.adata.var.index)
            ids = [i for i in id_table[to_id].values if i in known_ids]
        else:
            ids = [i for i in id_table[to_id].values]

        return ids

cellgene_filter(min_genes=100, min_cells=20)

Filters data by minimum number of genes expressed per cell and minimum expressing cells per gene.

Parameters:

Name Type Description Default
min_genes int

minimum number of genes expressed per cell as a cutoff.

100
min_cells int

minimum number of cells expressed per gene as a cutoff.

20
Source code in utils/scanpy_wrappers.py
84
85
86
87
88
89
90
91
92
def cellgene_filter(self, min_genes=100, min_cells=20):
    """Applies the per-cell and per-gene minimum-expression cutoffs to `self.adata`.

    Args:
        min_genes (int): a cell is kept only if it expresses at least this many genes.
        min_cells (int): a gene is kept only if it is expressed in at least this many cells.
    """
    adata = self.adata
    # Order matters: cells are filtered first, genes second.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)

export_top_genes(key)

Exports the top genes list to a text file, assigning the object as an attribute of the parent object.

Parameters:

Name Type Description Default
key str

a key for the gene list, ending in '_list'.

required

Raises:

Type Description
Exception

when the key does not end in '_list'.

Source code in utils/scanpy_wrappers.py
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
def export_top_genes(self, key: str):
    """Builds a GeneListFile for a stored gene list and attaches it to the parent object.

    The key is expected to look like `top_<type>_<datatype>_list`. The
    resulting object is stored under the attribute `key + '_file'`.

    Args:
        key (str): a key for the gene list, ending in `'_list'`.

    Raises:
        ValueError: when the key does not end in `'_list'` or has too few
            underscore-separated parts. (ValueError is a subclass of Exception.)
    """
    # Check the suffix, not mere containment, to match the documented contract.
    if not key.endswith('_list'):
        raise ValueError('key must be an attribute ending in "_list"')

    parts = key.split('_')
    if len(parts) < 4:
        raise ValueError('key must look like "top_<type>_<datatype>_list"')

    top_type = parts[1]
    # Everything between the type and the trailing 'list' is the datatype,
    # so datatypes that themselves contain underscores survive intact.
    datatype = '_'.join(parts[2:-1])

    # Imported here so the name is in scope for this function.
    # NOTE(review): presumably kept local to avoid a circular import -- confirm.
    from biofile_handling import GeneListFile

    top_genelist_filename = '_'.join([self.species_prefix, self.datatype, 'top', top_type, datatype, 'ids.txt'])
    top_genelist_file = GeneListFile(filename = top_genelist_filename,
                                     sampledict = self.sampledict,
                                     sources = self.matrix,
                                     genes = getattr(self, key),
                                     identifier = datatype)
    setattr(self, key + '_file', top_genelist_file)

get_top_genes(datatype, top_number=200, tofile=True)

Get the top_number genes per cluster across all clusters without repetition.

Also get the top marker gene per cluster.
Save these values as object attributes with a special key to the parent object.

Parameters:

Name Type Description Default
datatype str

descriptor for the data type, to be included in the outfile name.
Note: This is not checked for correctness!

required
top_number int

number of top genes to pull out from each cluster.

200
tofile bool

whether to save a file. (Not implemented yet.)

True

Returns:

Type Description
tuple[str, str]

keys of the top gene list files.

Source code in utils/scanpy_wrappers.py
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def get_top_genes(self, datatype: str, top_number = 200, tofile = True):
    """Get the top_number genes per cluster across all clusters without repetition.

    Also get the top marker gene per cluster.
    Both lists are stored on the parent object under the returned attribute
    names, and the full ranking table is stored as `self.marker_genes_df`.
    Genes appear in first-seen order (rank by rank across clusters).

    Args:
        datatype (str): descriptor for the data type, to be included in the attribute names.
            Note: This is not checked for correctness!
        top_number (int): number of top genes to pull out from each cluster.
        tofile (bool): whether to save a file. (Not implemented yet; currently unused.)

    Returns:
        (tuple[str, str]): attribute names of the top gene list and the top marker gene list.
    """
    self.marker_genes_df = pd.DataFrame(self.adata.uns['rank_genes_groups']['names'])

    # dict.fromkeys de-duplicates while keeping order, so repeated runs give
    # identical lists (a set made the order arbitrary).
    top_rows = self.marker_genes_df.iloc[0:top_number].values.tolist()
    top_genelist = list(dict.fromkeys(chain.from_iterable(top_rows)))
    top_marker_genes = list(dict.fromkeys(self.marker_genes_df.iloc[0].values))

    genelisttype = '_'.join(['top', str(top_number) + 'DE', datatype, 'list'])
    markergenetype = '_'.join(['top', 'marker', datatype, 'list'])
    setattr(self, genelisttype, top_genelist)
    setattr(self, markergenetype, top_marker_genes)

    return genelisttype, markergenetype

map_cellannots(cellannot)

Adds an additional .obs feature for cell annotation.

Imports from a tab-separated file that has two columns: cell_barcode and celltype.
The cell_barcode field should be an exact match for cells in the data.
Cells without a matching barcode are given the celltype label 'Unlabeled'.

Parameters:

Name Type Description Default
cellannot CellAnnotFile

CellAnnotFile object, usually in the same BioFileDocket.

required
Source code in utils/scanpy_wrappers.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def map_cellannots(self, cellannot):
    """Adds an additional .obs feature for cell annotation.

    Imports from a tab-separated file that has two columns: `cell_barcode` and `celltype`.
    The `cell_barcode` field should be an exact match for cells in the data.
    Cells without a matching barcode are given the `celltype` label `'Unlabeled'`.
    If a barcode is listed more than once, its first row is used.

    Args:
        cellannot (CellAnnotFile): CellAnnotFile object, usually in the same BioFileDocket.
    """

    barcodes = pd.DataFrame({'cell_barcode': self.adata.obs.index})
    # De-duplicate first: repeated barcodes would make the left merge emit
    # extra rows and the assignment into .obs fail with a length mismatch.
    annots = pd.read_csv(cellannot.path, sep = '\t').drop_duplicates(subset = 'cell_barcode')
    merged = barcodes.merge(annots[['cell_barcode', 'celltype']], on = 'cell_barcode', how = 'left')
    # Left merge keeps the row order of `barcodes`, so values line up with .obs.
    self.adata.obs['celltype'] = merged['celltype'].fillna('Unlabeled').values

map_cellannots_multispecies(msd)

Adds an additional .obs feature for cell annotation from a MultiSpeciesBioFileDocket.

Imports from a tab-separated file that has two columns: cell_barcode and celltype.
The cell_barcode field should be an exact match for cells in the data.
Cells without a matching barcode are given the celltype label 'Unlabeled'.

Parameters:

Name Type Description Default
msd MultiSpeciesBioFileDocket

the docket containing information about files from all species in the dataset.
The function looks for the BioFile object at the key cellannot for each species.

required
Source code in utils/scanpy_wrappers.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def map_cellannots_multispecies(self, msd):
    """Adds an additional .obs feature for cell annotation from a MultiSpeciesBioFileDocket.

    Imports, per species, from a tab-separated file that has two columns:
    `cell_barcode` and `celltype`. Both columns are prefixed with
    `<species key>_` before matching, so barcodes in the data are expected
    to carry the same prefix.
    Cells without a matching barcode are given the `celltype` label `'Unlabeled'`.
    If a prefixed barcode is listed more than once, its first row is used.

    Args:
        msd (MultiSpeciesBioFileDocket): the docket containing information about files from all species in the dataset.
            The function looks for the BioFile object at the key `cellannot` for each species.
    """

    barcodes = pd.DataFrame({'cell_barcode': self.adata.obs.index})

    dockets = msd.species_BioFileDockets
    frames = []
    for pre in dockets:
        annot = pd.read_csv(dockets[pre].cellannot.path, sep = '\t')
        annot['cell_barcode'] = pre + '_' + annot['cell_barcode']
        annot['celltype'] = pre + '_' + annot['celltype']
        frames.append(annot)

    # Concatenate once; with no species an empty frame is used, so the column
    # lookup below raises KeyError exactly as the merge did before.
    annots = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()
    # Repeated barcodes would otherwise add rows in the left merge and break
    # the assignment into .obs with a length mismatch.
    annots = annots.drop_duplicates(subset = 'cell_barcode')

    merged = barcodes.merge(annots[['cell_barcode', 'celltype']], on = 'cell_barcode', how = 'left')
    self.adata.obs['celltype'] = merged['celltype'].fillna('Unlabeled').values

map_gene_to_id(idmm, gene_list, from_id, to_id, check_ids=True)

Maps IDs from one type into another using an IdmmFile object.

Parameters:

Name Type Description Default
idmm IdmmFile

the IdmmFile object you're converting IDs between.

required
gene_list list of str

list of genes in the dataset you want to convert.

required
from_id str

starting feature column in the idmm.

required
to_id str

ending feature in the idmm.

required
check_ids bool

whether to check if the ids in gene_list are present in the parent object.
If true, only returns mappings that are actually found in the parent data.

True
Source code in utils/scanpy_wrappers.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def map_gene_to_id(self, idmm, gene_list: list, from_id: str, to_id: str, check_ids = True):
    """Maps IDs from one type into another using an IdmmFile object.

    Reads the tab-separated file at `idmm.path`, shows the de-duplicated
    `from_id` -> `to_id` table for the requested genes, and returns the
    mapped ids.

    Args:
        idmm (IdmmFile): the IdmmFile object you're converting IDs between.
        gene_list (list of str): list of genes in the dataset you want to convert.
        from_id (str): starting feature column in the idmm.
        to_id (str): ending feature in the idmm.
        check_ids (bool): whether to check if the mapped ids are present in the parent object.
            If true, only returns mappings that are found in `self.adata.var.index`.

    Returns:
        (list): mapped ids, in table order.
    """

    import pandas as pd

    # `display` is not defined in this block; it is presumably the helper that
    # IPython/Jupyter injects. Fall back to print so this also runs elsewhere.
    try:
        show = display
    except NameError:
        show = print

    # Use a separate name rather than overwriting the `idmm` argument.
    mapping = pd.read_csv(idmm.path, sep = '\t')

    id_table = mapping[mapping[from_id].isin(gene_list)][[from_id, to_id]].drop_duplicates()
    show(id_table)

    if check_ids:
        # Build the lookup once instead of re-listing the index per element.
        known_ids = set(self.adata.var.index)
        ids = [i for i in id_table[to_id].values if i in known_ids]
    else:
        ids = [i for i in id_table[to_id].values]

    return ids

normalize(max_n_genes_by_counts=7000, target_sum=10000.0)

Normalizes data using a max_n_genes_by_counts cutoff and a target sum.

Modifies underlying adata using cutoff and normalization.

Parameters:

Name Type Description Default
max_n_genes_by_counts int

removes cells whose n_genes_by_counts is greater than or equal to this value (only cells strictly below it are kept).

7000
target_sum float

target number of counts per cell, using scientific notation.

10000.0
Source code in utils/scanpy_wrappers.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def normalize(self, max_n_genes_by_counts = 7000, target_sum = 1e4):
    """Normalizes data using a max_n_genes_by_counts cutoff and a target sum.

    Modifies underlying `adata` using cutoff and normalization: cells are
    subset first, then `sc.pp.normalize_total` and `sc.pp.log1p` are applied.

    Args:
        max_n_genes_by_counts (int): only cells whose `n_genes_by_counts` is
            strictly below this value are kept.
        target_sum (float): target number of counts per cell, passed to
            `sc.pp.normalize_total`.
    """
    # NOTE(review): relies on obs.n_genes_by_counts already existing, presumably
    # from an earlier sc.pp.calculate_qc_metrics call -- confirm call order.
    keep = self.adata.obs.n_genes_by_counts < max_n_genes_by_counts
    # Copy so the in-place steps below act on real data rather than on a view.
    self.adata = self.adata[keep, :].copy()
    sc.pp.normalize_total(self.adata, target_sum = target_sum)
    sc.pp.log1p(self.adata)

pca_basic(svd_solver='arpack', color=[], plot=True)

Runs a simple PCA, displaying the first two components and variance plot.

Parameters:

Name Type Description Default
svd_solver str

SVD solving function, passed to sc.tl.pca().

'arpack'
color list

list of coloring schemes for points in the data; creates one plot per color scheme.

[]
plot bool

whether or not to make a plot.

True
Source code in utils/scanpy_wrappers.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def pca_basic(self, svd_solver='arpack', color = [], plot = True):
    """Computes a PCA on `self.adata` and optionally plots the first two components.

    The PCA plot is saved under `species_prefix + datatype + '_pca.pdf'`.
    The variance-ratio plot is drawn only when `color` is empty.

    Args:
        svd_solver (str): SVD solving function, passed to `sc.tl.pca()`.
        color (list): list of coloring schemes for points in the data, passed to `sc.pl.pca`.
        plot (bool): whether or not to make a plot.
    """
    sc.tl.pca(self.adata, svd_solver=svd_solver)

    if plot:
        outfile = self.species_prefix + self.datatype + '_pca.pdf'
        if color != []:
            sc.pl.pca(self.adata, color=color, save=outfile)
        else:
            sc.pl.pca(self.adata, save=outfile)
            # Only reached when no coloring scheme was requested.
            sc.pl.pca_variance_ratio(self.adata, log=True)

    return None

rank_genes(on='leiden', method='t-test', plot=True, n_genes=25, sharey=False)

Runs sc.tl.rank_genes_groups based on passed parameters.

Parameters can be changed to alter the ranking scheme.

Parameters:

Name Type Description Default
on str

clustering feature to use. Defaults to 'leiden'.

'leiden'
method str

how to compare groups for gene ranking. Defaults to 't-test'.

't-test'
plot bool

whether or not to plot the data.

True
n_genes int

number of genes to plot.

25
sharey bool

pass to sc.pl.rank_genes_groups.

False
Source code in utils/scanpy_wrappers.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def rank_genes(self, on = 'leiden', method = 't-test', plot = True, n_genes = 25, sharey = False):
    """Runs `sc.tl.rank_genes_groups` based on passed parameters.

    Parameters can be changed to alter the ranking scheme.
    The ranked `adata` is always written to `self.resultsfile`.

    Args:
        on (str): clustering feature to use. Defaults to 'leiden'.
        method (str): how to compare groups for gene ranking. Defaults to 't-test'.
        plot (bool): whether or not to plot the data.
        n_genes (int): number of genes to plot.
        sharey (bool): pass to `sc.pl.rank_genes_groups`.
    """

    sc.tl.rank_genes_groups(self.adata, on, method=method)
    # Honor the documented `plot` switch; it used to be ignored.
    if plot:
        sc.pl.rank_genes_groups(self.adata, n_genes=n_genes, sharey=sharey)
    # NOTE(review): this writes to self.resultsfile, not self.resultspath --
    # confirm which location is intended.
    self.adata.write(self.resultsfile)

read(delimiter='\t', cache=True, transpose=True, filter_set=set())

Reads the contained matrix into a scanpy.adata object, transposing if needed.

Parameters:

Name Type Description Default
delimiter str

a delimiter for the data, such as ',' or '\t'. Defaults to '\t'.

'\t'
cache bool

whether or not to create a cache file for the matrix. Defaults to True.

True
transpose bool

whether or not to transpose the data matrix after loading. Defaults to True.

True
filter_set set

a set of gene ids to keep from the data, removing those that don't match.
Transposes matrix after filtering if transpose == True.

set()
Source code in utils/scanpy_wrappers.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def read(self, delimiter = '\t', cache = True, transpose = True, filter_set = set()):
    r"""Loads the matrix file into `self.adata`, optionally filtering rows and transposing.

    Args:
        delimiter (str): a delimiter for the data, such as ',' or '\t'. Defaults to '\t'.
        cache (bool): whether or not to create a cache file for the matrix. Defaults to True.
        transpose (bool): whether or not to transpose the data matrix after loading. Defaults to True.
        filter_set (set): a set of ids to keep; rows of the loaded matrix whose
            index is not in the set are removed. An empty set disables filtering.
            Filtering happens before the optional transpose.
    """
    self.adata = sc.read(self.matrixpath, cache=cache, delimiter=delimiter)

    if len(filter_set) > 0:
        keep_mask = self.adata.obs.index.isin(filter_set)
        self.adata = self.adata[keep_mask].copy()

    if not transpose:
        return

    self.adata = self.adata.transpose()

regress_scale(how=['total_counts'], max_value=10)

Runs regression of specific features and scales data to a maximum value.

Parameters:

Name Type Description Default
how list

list of features to regress out. Defaults to ['total_counts'].

['total_counts']
max_value float | int

maximum value to scale data.

10
Source code in utils/scanpy_wrappers.py
128
129
130
131
132
133
134
135
136
137
def regress_scale(self, how = ['total_counts'], max_value = 10):
    """Regresses out the given features, then scales the data with a value cap.

    Args:
        how (list): features to regress out. Defaults to ['total_counts'].
        max_value (float | int): maximum value to scale data, passed to `sc.pp.scale`.
    """
    adata = self.adata
    # Regression runs before scaling; both calls receive the same adata object.
    sc.pp.regress_out(adata, how)
    sc.pp.scale(adata, max_value=max_value)

umap_leiden(n_neighbors=50, n_pcs=40, legend_loc='on data', save=True, plot=True)

Builds the neighbor graph, then runs UMAP followed by Leiden clustering, on the data.

Parameters:

Name Type Description Default
n_neighbors int

number of neighbors to pass to sc.pp.neighbors.

50
n_pcs int

number of Principal Components to use from PCA, passed to sc.pp.neighbors.

40
legend_loc str

where the legend should go (e.g. 'on data'.)

'on data'
save bool

whether to save the plot.

True
plot bool

whether to make the plot.

True
Source code in utils/scanpy_wrappers.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
def umap_leiden(self, n_neighbors=50, n_pcs = 40, legend_loc='on data', save = True, plot = True):
    """Computes neighbors and a UMAP embedding, then runs Leiden clustering on the data.

    The calls are made in this order: `sc.pp.neighbors`, `sc.tl.umap`, `sc.tl.leiden`.
    When `plot` is true, the UMAP is drawn colored by the `leiden` labels.

    Args:
        n_neighbors (int): number of neighbors to pass to `sc.pp.neighbors`.
        n_pcs (int): number of Principal Components to use from PCA, passed to `sc.pp.neighbors`.
        legend_loc (str): where the legend should go (e.g. `'on data'`).
        save (bool): whether to save the plot. Only used when `plot` is true.
        plot (bool): whether to make the plot.
    """
    adata = self.adata
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata)
    sc.tl.leiden(adata)

    if plot:
        plot_kwargs = {'color': ['leiden'], 'legend_loc': legend_loc}
        if save:
            # Figure name suffix is built from the species prefix and datatype.
            plot_kwargs['save'] = '_'.join([self.species_prefix, self.datatype, 'leiden.pdf'])
        sc.pl.umap(adata, **plot_kwargs)

variable_filter(min_mean=0.0125, max_mean=3, min_disp=0.1, max_disp=10, plot=True)

Filters genes to use for dimensionality reduction using max/min of mean and max/min of dispersion, plotting optionally.

Modifies underlying adata and saves original data under adata.raw.

Parameters:

Name Type Description Default
min_mean float | int

minimum mean expression of genes.

0.0125
max_mean float | int

maximum mean expression of genes.

3
min_disp float | int

minimum dispersion of gene expression.

0.1
max_disp float | int

maximum dispersion of gene expression.

10
plot bool

whether or not to generate a scatter plot showing cutoffs.

True
Source code in utils/scanpy_wrappers.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def variable_filter(self, min_mean = 0.0125, max_mean = 3, min_disp = 0.1, max_disp = 10, plot = True):
    """Keeps only highly variable genes, selected by mean and dispersion cutoffs.

    Runs `sc.pp.highly_variable_genes` with the given cutoffs, optionally plots the
    result, stores the unfiltered data under `adata.raw`, and then replaces `adata`
    with the subset of genes flagged in `adata.var.highly_variable`.

    Args:
        min_mean (float | int): minimum mean expression of genes.
        max_mean (float | int): maximum mean expression of genes.
        min_disp (float | int): minimum dispersion of gene expression.
        max_disp (float | int): maximum dispersion of gene expression.
        plot (bool): whether or not to generate a scatter plot showing cutoffs.
    """
    cutoffs = {
        'min_mean': min_mean,
        'max_mean': max_mean,
        'min_disp': min_disp,
        'max_disp': max_disp,
    }
    sc.pp.highly_variable_genes(self.adata, **cutoffs)

    if plot:
        sc.pl.highly_variable_genes(self.adata)

    # Stash the unfiltered data under .raw before narrowing to the selected genes.
    unfiltered = self.adata
    unfiltered.raw = unfiltered
    self.adata = unfiltered[:, unfiltered.var.highly_variable]

violin(x='total_counts', y='n_genes_by_counts', plot=True)

Runs sc.pp.calculate_qc_metrics and, when plot is True, sc.pl.violin and sc.pl.scatter.

Parameters:

Name Type Description Default
x str

label of the feature to be plotted on the x axis of the violin & scatter plot.
Defaults to 'total_counts'.

'total_counts'
y str

label of the feature to be plotted on the y axis of the violin & scatter plot. Defaults to 'n_genes_by_counts'.

'n_genes_by_counts'
Source code in utils/scanpy_wrappers.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def violin(self, x = 'total_counts', y = 'n_genes_by_counts', plot = True):
    """Computes QC metrics and optionally draws violin and scatter plots of them.

    `sc.pp.calculate_qc_metrics` is always run in place on `self.adata`;
    `sc.pl.violin` and `sc.pl.scatter` are only called when `plot` is true.

    Args:
        x (str): label of the feature to be plotted on the x axis of the violin & scatter plot.
            Defaults to 'total_counts'.
        y (str): label of the feature to be plotted on the y axis of the violin & scatter plot.
            Defaults to 'n_genes_by_counts'.
        plot (bool): whether or not to generate the plots. Defaults to True.
    """
    adata = self.adata
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

    if not plot:
        return

    sc.pl.violin(adata, [y, x], jitter=0.4, multi_panel=True)
    sc.pl.scatter(adata, x=x, y=y)

diagonalize_df(df)

Sorts values of a 2d dataframe along their diagonal for prettier plotting.

Source code in utils/scanpy_wrappers.py
324
325
326
327
328
329
330
331
332
def diagonalize_df(df):
    """Sorts values of a 2d dataframe along their diagonal for prettier plotting.

    Rows are reordered by the position of the column that holds each row's
    maximum value, so the largest values run from top-left to bottom-right.
    Rows whose maxima fall in the same column keep their original relative order.

    Args:
        df (pd.DataFrame): 2d dataframe whose row values can be compared with `max`.

    Returns:
        pd.DataFrame: a copy of `df` with its rows reordered. Columns, index
        labels and the input dataframe itself are left untouched.
    """
    # Rows are read by position rather than by label: with duplicate index
    # labels, `df.loc[label]` returns a DataFrame and the maximum would be
    # taken over column names instead of row values.
    max_order = []
    for pos in range(len(df)):
        row = list(df.iloc[pos])
        max_order.append(row.index(max(row)))

    # `sorted` is stable, so ties are resolved deterministically. Ordering by
    # position also avoids a temporary helper column that could collide with
    # (and then drop) a real column of the same name.
    order = sorted(range(len(max_order)), key=max_order.__getitem__)

    return df.iloc[order].copy(deep = True)