Simulated Cancer Omics dataset using InterSim CRAN package
Import packages and IntegrAO code
[13]:
import numpy as np
import pandas as pd
import snf
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt
import sys
import os
import argparse
import torch
import umap
from sklearn.model_selection import train_test_split
[2]:
# Add the parent directory of "integrao" to the Python path
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
sys.path.append(module_path)
from integrao.dataset import GraphDataset
from integrao.main import dist2
from integrao.integrater import integrao_integrater
Set hyperparameters
[3]:
torch.cuda.is_available()
[3]:
True
[7]:
# Hyperparameters
neighbor_size = 20
embedding_dims = 64
fusing_iteration = 30
normalization_factor = 1.0
alighment_epochs = 1000
beta = 1.0
mu = 0.5
dataset_name = 'cancer omics'
cluster_number = 15
Read data
[5]:
testdata_dir = os.path.join(module_path, "data/omics/")
methyl_ = os.path.join(testdata_dir, "omics1.txt")
expr_ = os.path.join(testdata_dir, "omics2.txt")
protein_ = os.path.join(testdata_dir, "omics3.txt")
truelabel = os.path.join(testdata_dir, "clusters.txt")
methyl = pd.read_csv(methyl_, index_col=0, delimiter="\t")
expr = pd.read_csv(expr_, index_col=0, delimiter="\t")
protein = pd.read_csv(protein_, index_col=0, delimiter="\t")
truelabel = pd.read_csv(truelabel, index_col=0, delimiter="\t")
methyl = np.transpose(methyl)
expr = np.transpose(expr)
protein = np.transpose(protein)
print(methyl.shape)
print(expr.shape)
print(protein.shape)
print(truelabel.shape)
print("finish loading data!")
(500, 367)
(500, 131)
(500, 160)
(500, 2)
finish loading data!
KNN clustering for clustering cnv, meth and mirna raw features as baseline
[11]:
from sklearn.cluster import KMeans
from sklearn.metrics import v_measure_score
# cnv
kmeans = KMeans(n_clusters=cluster_number, random_state=42, n_init='auto').fit(methyl.values)
labels = kmeans.labels_
true_labels = truelabel['cluster.id'].tolist()
score = v_measure_score(true_labels, labels)
print("methyl raw features kmeans clustering NMI: {}".format(score))
# meth
kmeans = KMeans(n_clusters=cluster_number, random_state=42, n_init='auto').fit(expr.values)
labels = kmeans.labels_
true_labels = truelabel['cluster.id'].tolist()
score = v_measure_score(true_labels, labels)
print("expr raw features kmeans clustering NMI: {}".format(score))
# mirna
kmeans = KMeans(n_clusters=cluster_number, random_state=42, n_init='auto').fit(protein.values)
labels = kmeans.labels_
true_labels = truelabel['cluster.id'].tolist()
score = v_measure_score(true_labels, labels)
print("protein raw features kmeans clustering NMI: {}".format(score))
methyl raw features kmeans clustering NMI: 0.4092647077881033
expr raw features kmeans clustering NMI: 0.7651290634507789
protein raw features kmeans clustering NMI: 0.868987364136353
Random sub-sample the omics dataset to create an incomplete dataset
[14]:
ratio = 0.7
full_indices = range(len(methyl))
unique_indices, common_indices = train_test_split(full_indices, test_size=ratio)
w1w2_indices, w3_indices = train_test_split(unique_indices, test_size=0.33)
w1_indices, w2_indices = train_test_split(w1w2_indices, test_size=0.5)
w1_full_indices = common_indices + w1_indices
w2_full_indices = common_indices + w2_indices
w3_full_indices = common_indices + w3_indices
methyl_temp = methyl.iloc[w1_full_indices]
expr_temp = expr.iloc[w2_full_indices]
protein_temp = protein.iloc[w3_full_indices]
IntegrAO integration
[15]:
# Initialize integrater
integrater = integrao_integrater(
[methyl_temp, expr_temp, protein_temp],
dataset_name,
neighbor_size=neighbor_size,
embedding_dims=embedding_dims,
fusing_iteration=fusing_iteration,
normalization_factor=normalization_factor,
alighment_epochs=alighment_epochs,
beta=beta,
mu=mu,
)
# data indexing
fused_networks = integrater.network_diffusion()
embeds_final, S_final, model = integrater.unsupervised_alignment()
labels = spectral_clustering(S_final, n_clusters=cluster_number)
true_labels = truelabel.sort_values('subjects')['cluster.id'].tolist()
score_all = v_measure_score(true_labels, labels)
print("IntegrAO for clustering union 500 samples NMI score: ", score_all)
Start indexing input expression matrices!
Common sample between view0 and view1: 350
Common sample between view0 and view2: 350
Common sample between view1 and view2: 350
Neighbor size: 20
Start applying diffusion!
Diffusion ends! Times: 15.066577911376953s
Starting unsupervised exmbedding extraction!
Dataset 0: (400, 367)
Dataset 1: (400, 131)
Dataset 2: (400, 160)
epoch 0: loss 30.054597854614258, align_loss:0.775736
epoch 100: loss 20.709983825683594, align_loss:0.151950
epoch 200: loss 1.092381477355957, align_loss:0.082980
epoch 300: loss 1.0914002656936646, align_loss:0.082236
epoch 400: loss 1.0902798175811768, align_loss:0.081427
epoch 500: loss 1.0890438556671143, align_loss:0.080614
epoch 600: loss 1.0877360105514526, align_loss:0.079772
epoch 700: loss 1.0863633155822754, align_loss:0.078964
epoch 800: loss 1.0849252939224243, align_loss:0.078173
epoch 900: loss 1.0834227800369263, align_loss:0.077401
Manifold alignment ends! Times: 56.45441031455994s
IntegrAO for clustering union 500 samples NMI score: 0.9862486593875475
UMAP plot of the integrated embeddings
[16]:
import umap
# umap visualization of embeddings with hue as labels
reducer = umap.UMAP(n_neighbors=30, min_dist=1.0, spread=1.0)
embedding = reducer.fit_transform(embeds_final.values)
plt.figure(figsize=(10, 10))
plt.scatter(
embedding[:, 0],
embedding[:, 1],
c=true_labels,
s=6,
cmap="Spectral",
alpha=1.0,
)
plt.setp(plt.gca(), xticks=[], yticks=[])
plt.title("UMAP projection of the dataset", fontsize=24)
plt.colorbar()
plt.show()