New patient classification with incomplete omics profiles

Import packages and IntegrAO code

[1]:

import numpy as np
import pandas as pd
import snf
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt

import sys
import os
import argparse
import torch

import umap
from sklearn.model_selection import train_test_split

[2]:

# Add the parent directory of "integrao" to the Python path
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from integrao.dataset import GraphDataset
from integrao.main import dist2
from integrao.integrater import integrao_integrater, integrao_predictor

Set hyperparameters

[3]:

torch.cuda.is_available()

[3]:

True

[4]:

# Hyperparameters
neighbor_size = 20
embedding_dims = 64
fusing_iteration = 30
normalization_factor = 1.0
alighment_epochs = 1000
beta = 1.0
mu = 0.5


dataset_name = 'cancer_omics_prediction'
cluster_number = 15

[5]:

# create result dir
result_dir = os.path.join(
    module_path, "results/{}".format(dataset_name)
)
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

Read data

[6]:

testdata_dir = os.path.join(module_path, "data/omics/")

methyl_ = os.path.join(testdata_dir, "omics1.txt")
expr_ = os.path.join(testdata_dir, "omics2.txt")
protein_ = os.path.join(testdata_dir, "omics3.txt")
truelabel = os.path.join(testdata_dir, "clusters.txt")


methyl = pd.read_csv(methyl_, index_col=0, delimiter="\t")
expr = pd.read_csv(expr_, index_col=0, delimiter="\t")
protein = pd.read_csv(protein_, index_col=0, delimiter="\t")
truelabel = pd.read_csv(truelabel, index_col=0, delimiter="\t")

methyl = np.transpose(methyl)
expr = np.transpose(expr)
protein = np.transpose(protein)
print(methyl.shape)
print(expr.shape)
print(protein.shape)
print(truelabel.shape)
print("finish loading data!")

(500, 367)
(500, 131)
(500, 160)
(500, 2)
finish loading data!

Random stratified-subsample 80%-20% samples to simulate the senario of incomplete omics dataset

[7]:

truelabel

[7]:

	subjects	cluster.id
1	subject1	6
2	subject2	7
3	subject3	9
4	subject4	6
5	subject5	4
...	...	...
496	subject496	1
497	subject497	14
498	subject498	4
499	subject499	1
500	subject500	9

500 rows × 2 columns

[8]:

common_patient = methyl.index
y = truelabel['cluster.id'].tolist()

X_train, X_test, y_train, y_test = train_test_split(common_patient, y, stratify=y, test_size=0.2)

# get the reference and query data
methyl_ref = methyl.loc[X_train]
expr_ref = expr.loc[X_train]
protein_ref = protein.loc[X_train]

methyl_query = methyl.loc[X_test]
expr_query = expr.loc[X_test]
protein_query = protein.loc[X_test]

Now let’s intergrate the reference data

[9]:

# Initialize integrater
integrater = integrao_integrater(
    [methyl_ref, expr_ref, protein_ref],
    dataset_name,
    modalities_name_list=["methyl", "expr", "protein"],   # used for naming the incomplete modalities during new sample inference
    neighbor_size=neighbor_size,
    embedding_dims=embedding_dims,
    fusing_iteration=fusing_iteration,
    normalization_factor=normalization_factor,
    alighment_epochs=alighment_epochs,
    beta=beta,
    mu=mu,
)
# data indexing
fused_networks = integrater.network_diffusion()
embeds_final, S_final, model = integrater.unsupervised_alignment()

# save the model for fine-tuning
torch.save(model.state_dict(), os.path.join(result_dir, "model.pth"))

Start indexing input expression matrices!
Common sample between view0 and view1: 400
Common sample between view0 and view2: 400
Common sample between view1 and view2: 400
Neighbor size: 20
Start applying diffusion!
Diffusion ends! Times: 8.610264778137207s
Starting unsupervised exmbedding extraction!
Dataset 0: (400, 367)
Dataset 1: (400, 131)
Dataset 2: (400, 160)
epoch 0: loss 27.511960983276367, align_loss:0.731402
epoch 100: loss 19.206762313842773, align_loss:0.109023
epoch 200: loss 0.6979491114616394, align_loss:0.062670
epoch 300: loss 0.697240948677063, align_loss:0.062080
epoch 400: loss 0.69642174243927, align_loss:0.061394
epoch 500: loss 0.6955264210700989, align_loss:0.060751
epoch 600: loss 0.6945570707321167, align_loss:0.060056
epoch 700: loss 0.6935387253761292, align_loss:0.059397
epoch 800: loss 0.692470371723175, align_loss:0.058727
epoch 900: loss 0.6913514733314514, align_loss:0.058086
Manifold alignment ends! Times: 20.610440015792847s

[10]:

labels = spectral_clustering(S_final, n_clusters=cluster_number)

# select from truelabel based on the 'subjects' column in embeds_final
truelabel_filtered = truelabel[truelabel['subjects'].isin(embeds_final.index)]
truelabel_filtered = truelabel_filtered.sort_values('subjects')['cluster.id'].tolist()

score_all = v_measure_score(truelabel_filtered, labels)
print("IntegrAO for clustering reference 400 samples NMI score: ", score_all)

IntegrAO for clustering reference 400 samples NMI score:  1.0

Now to perform fine-tuning using on the ground true labels

[11]:

truelabel_sub = truelabel[truelabel['subjects'].isin(embeds_final.index)]
truelabel_sub = truelabel_sub.set_index('subjects')

# minus 1 for the cluster id to avoid CUDA error
truelabel_sub['cluster.id'] = truelabel_sub['cluster.id'] - 1
truelabel_sub

[11]:

	cluster.id
subjects
subject2	6
subject3	8
subject4	5
subject5	3
subject6	10
...	...
subject495	8
subject496	0
subject497	13
subject499	0
subject500	8

400 rows × 1 columns

[12]:

embeds_final, S_final, model, preds = integrater.classification_finetuning(truelabel_sub, result_dir, finetune_epochs=800)

Starting supervised fineting!
Dataset 0: (400, 367)
Dataset 1: (400, 131)
Dataset 2: (400, 160)
IntegrAO(
  (feature): ModuleList(
    (0): GraphSAGE(367, 64, num_layers=2)
    (1): GraphSAGE(131, 64, num_layers=2)
    (2): GraphSAGE(160, 64, num_layers=2)
  )
  (feature_show): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.1, inplace=True)
    (3): Linear(in_features=64, out_features=64, bias=True)
  )
  (pred_head): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.1, inplace=True)
    (3): Linear(in_features=32, out_features=15, bias=True)
  )
)
Loaded pre-trained model with success.
epoch 0: loss 3.6955952644348145, kl_loss:0.641008, align_loss:0.054019, clf_loss:3.000568
epoch 100: loss 0.6305103302001953, kl_loss:0.589171, align_loss:0.041248, clf_loss:0.000092
epoch 200: loss 0.6225103735923767, kl_loss:0.582280, align_loss:0.040149, clf_loss:0.000082
epoch 300: loss 0.6215760707855225, kl_loss:0.581475, align_loss:0.040019, clf_loss:0.000082
epoch 400: loss 0.6204578876495361, kl_loss:0.580505, align_loss:0.039872, clf_loss:0.000081
epoch 500: loss 0.6191694140434265, kl_loss:0.579405, align_loss:0.039684, clf_loss:0.000081
epoch 600: loss 0.6176994442939758, kl_loss:0.578089, align_loss:0.039530, clf_loss:0.000081
epoch 700: loss 0.6160817742347717, kl_loss:0.576677, align_loss:0.039322, clf_loss:0.000082
Manifold alignment ends! Times: 40.30371856689453s

Now to perform inference on query data

[36]:

# Network fusion for the whole graph
predictor = integrao_predictor(
    [methyl, expr, protein],
    dataset_name,
    modalities_name_list=["methyl", "expr", "protein"],
    neighbor_size=neighbor_size,
    embedding_dims=embedding_dims,
    fusing_iteration=fusing_iteration,
    normalization_factor=normalization_factor,
    alighment_epochs=alighment_epochs,
    beta=beta,
    mu=mu,
)
# data indexing
fused_networks = predictor.network_diffusion()

Start indexing input expression matrices!
Common sample between view0 and view1: 500
Common sample between view0 and view2: 500
Common sample between view1 and view2: 500
Neighbor size: 20
Start applying diffusion!
Diffusion ends! Times: 11.70268440246582s

[41]:

from sklearn.metrics import accuracy_score, f1_score

# helper function to get the metrics on test set
def get_metrics(preds, preds_index, X_test, y_test):

    pred_df = pd.DataFrame(data=preds, index=preds_index)
    pred_df_test = pred_df.loc[X_test]

    # add 1 back to the cluster id
    pred_df_test = pred_df_test + 1

    f1_micro = f1_score(y_test, pred_df_test, average='micro')
    f1_weighted = f1_score(y_test, pred_df_test, average='weighted')
    acc = accuracy_score(y_test, pred_df_test)

    return f1_micro, f1_weighted, acc

Using one modalities

[42]:

# for methyl
preds = predictor.inference(model, new_datasets=[methyl], modalities_names=["methyl"])

f1_micro, f1_weight, acc = get_metrics(preds, methyl.index, X_test, y_test)

print("methyl f1_micro: ", f1_micro)
print("methyl f1_weight: ", f1_weight)
print("methyl acc: ", acc)

methyl f1_micro:  0.97
methyl f1_weight:  0.9686984126984127
methyl acc:  0.97

[43]:

# for expr
preds = predictor.inference(model, new_datasets=[expr], modalities_names=["expr"])

f1_micro, f1_weight, acc = get_metrics(preds, expr.index, X_test, y_test)

print("expr f1_micro: ", f1_micro)
print("expr f1_weight: ", f1_weight)
print("expr acc: ", acc)

expr f1_micro:  1.0
expr f1_weight:  1.0
expr acc:  1.0

[44]:

# for protein
preds = predictor.inference(model, new_datasets=[protein], modalities_names=["protein"])

f1_micro, f1_weight, acc = get_metrics(preds, protein.index, X_test, y_test)


print("protein f1_micro: ", f1_micro)
print("protein f1_weight: ", f1_weight)
print("protein acc: ", acc)

protein f1_micro:  1.0
protein f1_weight:  1.0
protein acc:  1.0

Two modalities

[45]:

# methyl and expr
preds = predictor.inference(model, new_datasets=[methyl, expr], modalities_names=["methyl", "expr"])

f1_micro, f1_weight, acc = get_metrics(preds, methyl.index, X_test, y_test)

print("methyl+expr f1_micro: ", f1_micro)
print("methyl+expr f1_weight: ", f1_weight)
print("methyl+expr acc: ", acc)

methyl+expr f1_micro:  1.0
methyl+expr f1_weight:  1.0
methyl+expr acc:  1.0

[46]:

# methyl and protein
preds = predictor.inference(model, new_datasets=[methyl, protein], modalities_names=["methyl", "protein"])

f1_micro, f1_weight, acc = get_metrics(preds, methyl.index, X_test, y_test)

print("methyl+protein f1_micro: ", f1_micro)
print("methyl+protein f1_weight: ", f1_weight)
print("methyl+protein acc: ", acc)

methyl+protein f1_micro:  1.0
methyl+protein f1_weight:  1.0
methyl+protein acc:  1.0

[47]:

# expr and protein
preds = predictor.inference(model, new_datasets=[expr, protein], modalities_names=["expr", "protein"])

f1_micro, f1_weight, acc = get_metrics(preds, expr.index, X_test, y_test)

print("expr+protein f1_micro: ", f1_micro)
print("expr+protein f1_weight: ", f1_weight)
print("expr+protein acc: ", acc)

expr+protein f1_micro:  1.0
expr+protein f1_weight:  1.0
expr+protein acc:  1.0

Three modalities

[48]:

# methyl, expr and protein
preds = predictor.inference(model, new_datasets=[methyl, expr, protein], modalities_names=["methyl", "expr", "protein"])

f1_micro, f1_weight, acc = get_metrics(preds, methyl.index, X_test, y_test)

print("methyl+expr+protein f1_micro: ", f1_micro)
print("methyl+expr+protein f1_weight: ", f1_weight)
print("methyl+expr+protein acc: ", acc)

methyl+expr+protein f1_micro:  1.0
methyl+expr+protein f1_weight:  1.0
methyl+expr+protein acc:  1.0