Integrating scCAT-seq cell lines#

Imports#

[ ]:

# Biology
import muon as mu
import scanpy as sc

import mowgli

[2]:

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

Load data#

Load the Liu cell lines dataset, which contains 3 well separated cell lines profiled with scRNA-seq and scATAC-seq. You can download the data from https://figshare.com/s/fadcb181b470d32c73c7

[ ]:

mdata = mu.read_h5mu("liu_preprocessed.h5mu.gz")

For computational reasons, in this simple demonstration we reduce the number of features a lot.

[4]:

sc.pp.highly_variable_genes(mdata["rna"], n_top_genes=500)
sc.pp.highly_variable_genes(mdata["atac"], n_top_genes=500)

Visualize independant modalities#

Let us visualize the cell lines independantly using UMAP projections.

[5]:

# Umap RNA
sc.pp.scale(mdata["rna"], zero_center=False)
sc.tl.pca(mdata["rna"], svd_solver="arpack")
sc.pp.neighbors(mdata["rna"], n_neighbors=10, n_pcs=10)
sc.tl.umap(mdata["rna"], spread=1.5, min_dist=0.5)

[6]:

# Umap ATAC
sc.pp.scale(mdata["atac"], zero_center=False)
sc.tl.pca(mdata["atac"], svd_solver="arpack")
sc.pp.neighbors(mdata["atac"], n_neighbors=10, n_pcs=10)
sc.tl.umap(mdata["atac"], spread=1.5, min_dist=0.5)

[7]:

# Display UMAP
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
sc.pl.umap(
    mdata["rna"],
    color="celltype",
    legend_loc="on data",
    size=50,
    show=False,
    ax=axes[0],
)
sc.pl.umap(
    mdata["atac"],
    color="celltype",
    legend_loc="on data",
    size=50,
    show=False,
    ax=axes[1],
)
plt.show()

../_images/vignettes_Liu_cell_lines_13_0.png

Train model#

Let us define the model and perform the dimensionaly reduction.

[8]:

# Define the model
model = mowgli.models.MowgliModel(latent_dim=5)

[9]:

# Perform the training.
model.train(mdata)

  4%|▍         | 8/200 [00:08<03:32,  1.11s/it, loss=-0.11106829, mass_transported=0.757, loss_inner=-0.053246647, inner_steps=60, gpu_memory_allocated=0]

Visualize the embedding#

Now, let us display the obtained embedding.

[10]:

# First using a UMAP plot. This is pure Scanpy!
sc.pp.neighbors(mdata, use_rep="W_OT", key_added="mowgli")
sc.tl.umap(mdata, min_dist=0.5, spread=1.5, neighbors_key="mowgli")
sc.pl.umap(mdata, color="rna:celltype", size=200, alpha=0.7, legend_loc="on data")

../_images/vignettes_Liu_cell_lines_20_0.png

[11]:

# Then using a dendogram.
mowgli.pl.clustermap(
    mdata,
    obsm="W_OT",
    yticklabels=False,
    figsize=(5, 5),
    col_cluster=False,
)

../_images/vignettes_Liu_cell_lines_21_0.png

[12]:

# Then, using a heatmap.
mowgli.pl.heatmap(
    mdata,
    obsm="W_OT",
    groupby="rna:celltype",
    figsize=(5, 5),
)

/users/csb/huizing/anaconda3/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:2414: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
  obs_tidy.index.value_counts(sort=False).iteritems()

../_images/vignettes_Liu_cell_lines_22_1.png

[13]:

# Finally, display a violin plot of the value at a given dimension of the cells.
mowgli.pl.factor_violin(mdata, groupby="rna:celltype", dim=1)

../_images/vignettes_Liu_cell_lines_23_0.png

Clustering#

We can perform clustering on the lower-dimensional space.

[14]:

# Again, pure Scanpy for clustering.
sc.tl.leiden(mdata, resolution=0.1, neighbors_key="mowgli")

[15]:

# Let's display the clustering results on the previously computed UMAP.
sc.pl.umap(
    mdata, color=["rna:celltype", "leiden"], size=200, alpha=0.7, legend_loc="on data"
)

../_images/vignettes_Liu_cell_lines_27_0.png