From 39a64a7db238813df56ab2899d60f6693fcafa78 Mon Sep 17 00:00:00 2001
From: SuperXiang <12507538@qq.com>
Date: Wed, 1 Dec 2021 14:29:45 +0800
Subject: [PATCH] update hybriddta

---
 .../hybriddta/README.md                       |  34 ++-
 .../hybriddta/pointwise/DeepDTA/get_len.py    |   8 +-
 .../pointwise/DeepDTA/train_bindingdb.py      |  10 +-
 .../pointwise/DeepDTA/train_davis.py          |   6 +-
 .../hybriddta/pointwise/DeepDTA/train_kiba.py |  32 ++-
 .../hybriddta/pointwise/GraphDTA/get_len.py   |  35 +++
 .../pointwise/GraphDTA/models/gat.py          |  74 +++++
 .../pointwise/GraphDTA/models/gat_gcn.py      |  74 +++++
 .../pointwise/GraphDTA/models/gcn.py          |  77 +++++
 .../pointwise/GraphDTA/models/ginconv.py      |  91 ++++++
 .../pointwise/GraphDTA/preprocess.py          | 110 +++++++
 .../pointwise/GraphDTA/processing.py          |  91 ++++++
 .../pointwise/GraphDTA/train_bindingDB.py     | 271 ++++++++++++++++++
 .../pointwise/GraphDTA/train_davis.py         | 205 +++++++++++++
 .../pointwise/GraphDTA/train_kiba.py          | 245 ++++++++++++++++
 .../hybriddta/pointwise/GraphDTA/utils.py     | 125 ++++++++
 .../pointwise/GraphDTA/utils_bindingDB.py     | 116 ++++++++
 .../hybriddta/pointwise/Moltrans/get_len.py   |   8 +-
 .../pointwise/Moltrans/train_bindingdb.py     |  10 +-
 .../pointwise/Moltrans/train_davis.py         |   6 +-
 .../pointwise/Moltrans/train_kiba.py          |  28 +-
 21 files changed, 1606 insertions(+), 50 deletions(-)
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/get_len.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat_gcn.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gcn.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/ginconv.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/preprocess.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/processing.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_bindingDB.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_davis.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_kiba.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils.py
 create mode 100644 apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils_bindingDB.py

diff --git a/apps/drug_target_interaction/hybriddta/README.md b/apps/drug_target_interaction/hybriddta/README.md
index 0a3717a8..afd2dcb4 100644
--- a/apps/drug_target_interaction/hybriddta/README.md
+++ b/apps/drug_target_interaction/hybriddta/README.md
@@ -1,6 +1,6 @@
 # HybridDTA
 
-Source code for paper: "HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction".
+Source code for paper: "HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction". The preprint is available on [bioRxiv](https://www.biorxiv.org/content/10.1101/2021.11.23.469641v1).
 
 ## Backgrounds
@@ -72,7 +72,7 @@ python run_pairwise_Moltrans_bindingDB.py --data_path '../../Data/' "--is_mixed
 
 ### Baseline
 
-We reimplement and provide all the baseline backbone models as following.
+We reproduce and provide all the baseline backbone models as follows.
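+
+For the GraphDTA scripts below, `--model` selects the backbone by its index in the model list (0 = GINConvNet, 1 = GATNet, 2 = GAT_GCN, 3 = GCNNet). For example, a sketch run from `./pointwise/GraphDTA` with the default hyperparameters:
+```bash
+# GCNNet backbone (index 3) on the Davis benchmark
+python train_davis.py --batchsize 512 --epochs 100 --rounds 1 --lr 5e-4 --cudanum 0 --model 3
+```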
 
 #### DeepDTA
 ```bash
 cd ./pointwise/DeepDTA
 ```
@@ -95,6 +95,18 @@ CUDA_VISIBLE_DEVICES=0 python train_bindingdb.py --batchsize 256 --epochs 50 --r
 ```bash
 cd ./pointwise/GraphDTA
 ```
+##### Run the training script for Davis with cross-validation
+```bash
+python train_davis.py --batchsize 512 --epochs 100 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
+```
+##### Run the training script for KIBA with cross-validation
+```bash
+python train_kiba.py --batchsize 512 --epochs 200 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
+```
+##### Run the training script for BindingDB
+```bash
+python train_bindingDB.py --batchsize 512 --epochs 50 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
+```
 
 #### Moltrans
 ```bash
 cd ./pointwise/Moltrans
 ```
@@ -114,6 +126,24 @@ CUDA_VISIBLE_DEVICES=0 python train_bindingdb.py --batchsize 64 --epochs 50 --ro
 ```
 
 
+## Citation
+
+If you find our work helpful in your research, please cite:
+```bibtex
+@article {Luo2021.11.23.469641,
+	author = {Luo, Hongyu and Xiang, Yingfei and Fang, Xiaomin and Lin, Wei and Wang, Fan and Wu, Hua and Wang, Haifeng},
+	title = {HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction},
+	elocation-id = {2021.11.23.469641},
+	year = {2021},
+	doi = {10.1101/2021.11.23.469641},
+	publisher = {Cold Spring Harbor Laboratory},
+	URL = {https://www.biorxiv.org/content/early/2021/11/23/2021.11.23.469641},
+	eprint = {https://www.biorxiv.org/content/early/2021/11/23/2021.11.23.469641.full.pdf},
+	journal = {bioRxiv}
+}
+```
+
+
 ## Reference
 
 **DAVIS**
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/get_len.py b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/get_len.py
index 019a73ea..a72f5240 100644
--- a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/get_len.py
+++ b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/get_len.py
@@ -20,16 +20,16 @@ def get_kiba_len():
 
     # Get length of validation set
     for cv in ["CV1", "CV2", "CV3", "CV4", "CV5"]:
-        df = pd.read_csv("../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
+        df = pd.read_csv("../../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
         df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
-        f = open("../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
+        f = open("../../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
         for i in df['counts'].values:
             f.write(str(i) + "\n")
 
 
     # Get length of testing set
-    df = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv")
+    df = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")
     df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
-    f = open("../Data/KIBA/kiba_len.txt",'a')
+    f = open("../../Data/KIBA/kiba_len.txt",'a')
     for i in df['counts'].values:
         f.write(str(i) + "\n")
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_bindingdb.py b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_bindingdb.py
index d36dcc6b..8eec40fe 100644
--- a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_bindingdb.py
+++ b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_bindingdb.py
@@ -106,9 +106,9 @@ def main(args):
     optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam
 
     # Load raw data
-    train_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv")
-    val_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv")
-    test_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv")
+    train_data =
pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv") + val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv") + test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv") train_set = BindingDB_Encoder(train_data.index.values, train_data) val_set = BindingDB_Encoder(val_data.index.values, val_data) @@ -176,8 +176,8 @@ def main(args): model_name = "bestModel/DeepDTA_BindingDB_ki_"+str(rounds)+".model" # Save the best result - if weight_ci > best_ci: - best_ci = weight_ci + if average_ci > best_ci: + best_ci = average_ci best_epoch = epoch best_train_loss = train_loss # Save best model diff --git a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_davis.py b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_davis.py index e9c68eba..fa59c086 100644 --- a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_davis.py +++ b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_davis.py @@ -103,9 +103,9 @@ def main(args): optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam # Load raw data - train_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv") - val_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv") - test_data = pd.read_csv("../Data/DAVIS/test_DAVIS_unseenP_seenD.csv") + train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv") + val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv") + test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv") train_set = Basic_Encoder(train_data.index.values, train_data) val_set = Basic_Encoder(val_data.index.values, val_data) diff --git a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_kiba.py b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_kiba.py index 47f42788..9788651e 100644 --- a/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_kiba.py +++ b/apps/drug_target_interaction/hybriddta/pointwise/DeepDTA/train_kiba.py @@ -116,9 +116,9 @@ def main(args): optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam # Load raw data - train_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv") - val_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv") - test_data = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv") + train_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv") + val_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv") + test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv") train_set = Basic_Encoder(train_data.index.values, train_data) val_set = Basic_Encoder(val_data.index.values, val_data) @@ -138,18 +138,21 @@ def main(args): G, P = predicting(model, val_loader) val_ci = concordance_index(G,P) - val_path = "../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt" + val_path = "../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt" # Check if kiba len file exists if(path.exists(val_path) == False): get_kiba_len() # Calculate Weighted CI, Average CI of validation set - li ,lens = cal_len(val_path) + li, lens = cal_len(val_path) s = 0 w_ci,a_ci = [],[] for l in li: - 
        w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
-        a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+        try:
+            w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
+            a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+        except:
+            pass
         s += l
     weight_ci, average_ci = np.sum(w_ci)/lens, np.mean(a_ci)
@@ -163,8 +166,8 @@ def main(args):
         model_name = "bestModel/MolTrans_kiba_"+str(rounds)+".model"
 
         # Save the best result
-        if weight_ci > best_ci:
-            best_ci = weight_ci
+        if average_ci > best_ci:
+            best_ci = average_ci
             best_epoch = epoch
             best_train_loss = train_loss
             # Save best model
@@ -180,17 +183,20 @@ def main(args):
     test_G, test_P = predicting(model, test_loader)
     test_CI,test_MSE = concordance_index(test_G,test_P), mse(test_G,test_P)
 
-    test_path = "../Data/KIBA/kiba_len.txt"
+    test_path = "../../Data/KIBA/kiba_len.txt"
     # Check if kiba len file exists
     if(path.exists(test_path) == False):
         get_kiba_len()
     # Calculate Weighted CI, Average CI of testing set
     t_li ,t_lens = cal_len(test_path)
     s = 0
-    w_ci,a_ci = [],[]
+    w_ci, a_ci = [], []
     for l in t_li:
-        w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
-        a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+        try:
+            w_ci.append(l*concordance_index(test_G[s:s+l],test_P[s:s+l]))
+            a_ci.append(concordance_index(test_G[s:s+l],test_P[s:s+l]))
+        except:
+            pass
         s += l
     test_weight_ci, test_average_ci = np.sum(w_ci)/t_lens, np.mean(a_ci)
 
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/get_len.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/get_len.py
new file mode 100644
index 00000000..a72f5240
--- /dev/null
+++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/get_len.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Calculate length of each group in dataset.""" + +import pandas as pd + + +def get_kiba_len(): + # Get length of validation set + for cv in ["CV1", "CV2", "CV3", "CV4", "CV5"]: + df = pd.read_csv("../../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv") + df = df.groupby(['Target ID']).size().reset_index(name = 'counts') + f = open("../../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a') + for i in df['counts'].values: + f.write(str(i) + "\n") + + + # Get length of testing set + df = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv") + df = df.groupby(['Target ID']).size().reset_index(name = 'counts') + f = open("../../Data/KIBA/kiba_len.txt",'a') + for i in df['counts'].values: + f.write(str(i) + "\n") \ No newline at end of file diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat.py new file mode 100644 index 00000000..15b21f17 --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat.py @@ -0,0 +1,74 @@ +"""GraphDTA_GAT backbone model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Sequential, Linear, ReLU +from torch_geometric.nn import GATConv +from torch_geometric.nn import global_max_pool as gmp + + +# GAT backbone model +class GATNet(torch.nn.Module): + """GAT model. + + Args: + data: Input data. + + Returns: + out: Prediction results. + """ + def __init__(self, num_features_xd=78, n_output=1, num_features_xt=25, + n_filters=32, embed_dim=128, output_dim=128, dropout=0.2): + super(GATNet, self).__init__() + # Basic config + self.relu = nn.ReLU() + self.dropout = nn.Dropout(dropout) + # SMILES graph branch + self.gcn1 = GATConv(num_features_xd, num_features_xd, heads=10, dropout=dropout) + self.gcn2 = GATConv(num_features_xd * 10, output_dim, dropout=dropout) + self.fc_g1 = nn.Linear(output_dim, output_dim) + # Protein sequence branch (1d conv) + self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim) + self.conv_xt1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8) + self.fc_xt1 = nn.Linear(32*121, output_dim) + # Combined layers + self.fc1 = nn.Linear(256, 1024) + self.fc2 = nn.Linear(1024, 256) + self.out = nn.Linear(256, n_output) + + def forward(self, data): + """tbd.""" + # Get graph input + x, edge_index, batch = data.x, data.edge_index, data.batch + # Get protein input + target = data.target + + x = F.dropout(x, p=0.2, training=self.training) + x = F.elu(self.gcn1(x, edge_index)) + + x = F.dropout(x, p=0.2, training=self.training) + x = self.gcn2(x, edge_index) + x = self.relu(x) + x = gmp(x, batch) # global max pooling + + x = self.fc_g1(x) + x = self.relu(x) + # 1d conv layers + embedded_xt = self.embedding_xt(target) + conv_xt = self.conv_xt1(embedded_xt) + conv_xt = self.relu(conv_xt) + # Flatten + xt = conv_xt.view(-1, 32 * 121) + xt = self.fc_xt1(xt) + # Concat + xc = torch.cat((x, xt), 1) + # Add some dense layers + xc = self.fc1(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + xc = self.fc2(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + out = self.out(xc) + return out diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat_gcn.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat_gcn.py new file mode 100644 index 00000000..cabcaf9d --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gat_gcn.py @@ -0,0 +1,74 @@ +"""GraphDTA_GATGCN backbone model.""" + +import torch +import 
torch.nn as nn +import torch.nn.functional as F +from torch.nn import Sequential, Linear, ReLU +from torch_geometric.nn import GCNConv, GATConv, GINConv, global_add_pool +from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp + + +# GATGCN backbone model +class GAT_GCN(torch.nn.Module): + """GATGCN model. + + Args: + data: Input data. + + Returns: + out: Prediction results. + """ + def __init__(self, n_output=1, num_features_xd=78, num_features_xt=25, + n_filters=32, embed_dim=128, output_dim=128, dropout=0.2): + super(GAT_GCN, self).__init__() + # Basic config + self.relu = nn.ReLU() + self.dropout = nn.Dropout(dropout) + self.n_output = n_output + # SMILES graph branch + self.conv1 = GATConv(num_features_xd, num_features_xd, heads=10) + self.conv2 = GCNConv(num_features_xd*10, num_features_xd*10) + self.fc_g1 = torch.nn.Linear(num_features_xd*10*2, 1500) + self.fc_g2 = torch.nn.Linear(1500, output_dim) + # Protein sequence branch (1d conv) + self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim) + self.conv_xt_1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8) + self.fc1_xt = nn.Linear(32*121, output_dim) + # Combined layers + self.fc1 = nn.Linear(256, 1024) + self.fc2 = nn.Linear(1024, 512) + self.out = nn.Linear(512, self.n_output) # n_output = 1 for regression task + + def forward(self, data): + """tbd.""" + # Get graph input + x, edge_index, batch = data.x, data.edge_index, data.batch + # Get protein input + target = data.target + + x = self.conv1(x, edge_index) + x = self.relu(x) + x = self.conv2(x, edge_index) + x = self.relu(x) + # Apply global max pooling (gmp) and global mean pooling (gap) + x = torch.cat([gmp(x, batch), gap(x, batch)], dim=1) + x = self.relu(self.fc_g1(x)) + x = self.dropout(x) + x = self.fc_g2(x) + + embedded_xt = self.embedding_xt(target) + conv_xt = self.conv_xt_1(embedded_xt) + # Flatten + xt = conv_xt.view(-1, 32 * 121) + xt = self.fc1_xt(xt) + # Concat + xc = torch.cat((x, xt), 1) + # Add some dense layers + xc = self.fc1(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + xc = self.fc2(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + out = self.out(xc) + return out diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gcn.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gcn.py new file mode 100644 index 00000000..67024a85 --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/gcn.py @@ -0,0 +1,77 @@ +"""GraphDTA_GCN backbone model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.nn import GCNConv, global_max_pool as gmp + + +# GCN backbone model +class GCNNet(torch.nn.Module): + """GCN model. + + Args: + data: Input data. + + Returns: + out: Prediction results. 
+ """ + def __init__(self, n_output=1, n_filters=32, embed_dim=128,num_features_xd=78, num_features_xt=25, output_dim=128, dropout=0.2): + super(GCNNet, self).__init__() + # Basic config + self.relu = nn.ReLU() + self.dropout = nn.Dropout(dropout) + # SMILES graph branch + self.n_output = n_output + self.conv1 = GCNConv(num_features_xd, num_features_xd) + self.conv2 = GCNConv(num_features_xd, num_features_xd*2) + self.conv3 = GCNConv(num_features_xd*2, num_features_xd * 4) + self.fc_g1 = torch.nn.Linear(num_features_xd*4, 1024) + self.fc_g2 = torch.nn.Linear(1024, output_dim) + # Protein sequence branch (1d conv) + self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim) + self.conv_xt_1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8) + self.fc1_xt = nn.Linear(32*121, output_dim) + # Combined layers + self.fc1 = nn.Linear(2*output_dim, 1024) + self.fc2 = nn.Linear(1024, 512) + self.out = nn.Linear(512, self.n_output) + + def forward(self, data): + """tbd.""" + # Get graph input + x, edge_index, batch = data.x, data.edge_index, data.batch + # Get protein input + target = data.target + + x = self.conv1(x, edge_index) + x = self.relu(x) + + x = self.conv2(x, edge_index) + x = self.relu(x) + + x = self.conv3(x, edge_index) + x = self.relu(x) + x = gmp(x, batch) # global max pooling + # Flatten + x = self.relu(self.fc_g1(x)) + x = self.dropout(x) + x = self.fc_g2(x) + x = self.dropout(x) + # 1d conv layers + embedded_xt = self.embedding_xt(target) + conv_xt = self.conv_xt_1(embedded_xt) + # Flatten + xt = conv_xt.view(-1, 32 * 121) + xt = self.fc1_xt(xt) + # Concat + xc = torch.cat((x, xt), 1) + # Add some dense layers + xc = self.fc1(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + xc = self.fc2(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + out = self.out(xc) + return out diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/ginconv.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/ginconv.py new file mode 100644 index 00000000..f946f281 --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/models/ginconv.py @@ -0,0 +1,91 @@ +"""GraphDTA_GIN backbone model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Sequential, Linear, ReLU +from torch_geometric.nn import GINConv, global_add_pool +from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp + + +# GINConv backbone model +class GINConvNet(torch.nn.Module): + """GINConv model. + + Args: + data: Input data. + + Returns: + out: Prediction results. 
+ """ + def __init__(self, n_output=1, num_features_xd=78, num_features_xt=25, + n_filters=32, embed_dim=128, output_dim=128, dropout=0.2): + super(GINConvNet, self).__init__() + # Basic config + dim = 32 + self.dropout = nn.Dropout(dropout) + self.relu = nn.ReLU() + self.n_output = n_output + # SMILES graph branch + nn1 = Sequential(Linear(num_features_xd, dim), ReLU(), Linear(dim, dim)) + self.conv1 = GINConv(nn1) + self.bn1 = torch.nn.BatchNorm1d(dim) + nn2 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) + self.conv2 = GINConv(nn2) + self.bn2 = torch.nn.BatchNorm1d(dim) + nn3 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) + self.conv3 = GINConv(nn3) + self.bn3 = torch.nn.BatchNorm1d(dim) + nn4 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) + self.conv4 = GINConv(nn4) + self.bn4 = torch.nn.BatchNorm1d(dim) + nn5 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) + self.conv5 = GINConv(nn5) + self.bn5 = torch.nn.BatchNorm1d(dim) + self.fc1_xd = Linear(dim, output_dim) + # Protein sequence branch (1d conv) + self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim) + self.conv_xt_1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8) + self.fc1_xt = nn.Linear(32*121, output_dim) + # Combined layers + self.fc1 = nn.Linear(256, 1024) + self.fc2 = nn.Linear(1024, 256) + self.out = nn.Linear(256, self.n_output) # n_output = 1 for regression task + + def forward(self, data): + """tbd.""" + # Get graph input + x, edge_index, batch = data.x, data.edge_index, data.batch + # Get protein input + target = data.target + + x = F.relu(self.conv1(x, edge_index)) + x = self.bn1(x) + x = F.relu(self.conv2(x, edge_index)) + x = self.bn2(x) + x = F.relu(self.conv3(x, edge_index)) + x = self.bn3(x) + x = F.relu(self.conv4(x, edge_index)) + x = self.bn4(x) + x = F.relu(self.conv5(x, edge_index)) + x = self.bn5(x) + x = global_add_pool(x, batch) + x = F.relu(self.fc1_xd(x)) + x = F.dropout(x, p=0.2, training=self.training) + + embedded_xt = self.embedding_xt(target) + conv_xt = self.conv_xt_1(embedded_xt) + # Flatten + xt = conv_xt.view(-1, 32 * 121) + xt = self.fc1_xt(xt) + # Concat + xc = torch.cat((x, xt), 1) + # Add some dense layers + xc = self.fc1(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + xc = self.fc2(xc) + xc = self.relu(xc) + xc = self.dropout(xc) + out = self.out(xc) + return out diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/preprocess.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/preprocess.py new file mode 100644 index 00000000..3c4f5dea --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/preprocess.py @@ -0,0 +1,110 @@ +"""Preprocessing scripts for GraphDTA.""" + +import pandas as pd +import numpy as np +import os +import rdkit +import sklearn +import torch +import json,pickle +from collections import OrderedDict +from rdkit import Chem +from rdkit.Chem import MolFromSmiles +import networkx as nx +from utils import * + +# Global setting +seq_voc = "ABCDEFGHIKLMNOPQRSTUVWXYZ" +seq_dict = {v:(i+1) for i,v in enumerate(seq_voc)} +seq_dict_len = len(seq_dict) +max_seq_len = 1000 + +def one_of_k_encoding(x, allowable_set): + """tbd.""" + if x not in allowable_set: + raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set)) + return list(map(lambda s: x == s, allowable_set)) + +def one_of_k_encoding_unk(x, allowable_set): + """Maps inputs not in the allowable set to the last element.""" + if x not in allowable_set: + x = allowable_set[-1] + return 
list(map(lambda s: x == s, allowable_set)) + +def atom_features(atom): + """Atom feat.""" + return np.array(one_of_k_encoding_unk(atom.GetSymbol(),['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na','Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb','Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H','Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr','Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) + + one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) + + one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) + + one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) + + [atom.GetIsAromatic()]) + +def smile_to_graph(smile): + """SMILES to graph.""" + mol = Chem.MolFromSmiles(smile) + c_size = mol.GetNumAtoms() + + features = [] + for atom in mol.GetAtoms(): + feature = atom_features(atom) + features.append( feature / sum(feature) ) + edges = [] + for bond in mol.GetBonds(): + edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]) + g = nx.Graph(edges).to_directed() + edge_index = [] + for e1, e2 in g.edges: + edge_index.append([e1, e2]) + return c_size, features, edge_index + +def seq_cat(prot): + """tbd.""" + x = np.zeros(max_seq_len) + for i, ch in enumerate(prot[:max_seq_len]): + x[i] = seq_dict[ch] + return x + +def process_data(df): + """Process data.""" + pairs=[] + i = 0 + for _,row in df.iterrows(): + try: + pair = [] + lg = Chem.MolToSmiles(Chem.MolFromSmiles(row[1]), isomericSmiles=True) # smiles + pair.append(lg) + pair.append(seq_cat(row[0])) + pair.append(row[4]) # label + pair.append(row[2]) # target name + pairs.append(pair) + except: + i += 1 + + print('discard {} SMILES'.format(i)) + pairs=pd.DataFrame(pairs) + #Drug + compound_iso_smiles = pairs.iloc[:,0] + compound_iso_smiles = set(compound_iso_smiles) + smile_graph = {} + outlier_smiles = [] + for smile in compound_iso_smiles: + g = smile_to_graph(smile) + smile_graph[smile] = g + _, _, edge_index = g + edge_index=torch.LongTensor(edge_index) + if len(edge_index.shape) == 1: + outlier_smiles.append(smile) + print('we discard smiles sequence : {}'.format(outlier_smiles)) + + train_drugs, train_prots, train_Y, target_name= list(pairs.iloc[:,0]),list(pairs.iloc[:,1]),list(pairs.iloc[:,2]), list(pairs.iloc[:,3]) + target_name, train_drugs, train_prots, train_Y = np.asarray(target_name), np.asarray(train_drugs), np.asarray(train_prots), np.asarray(train_Y) + mask = np.full(len(train_drugs),True) + for i in outlier_smiles: + temp = train_drugs != i + mask = mask & temp + + target_name = target_name[mask] + train_drugs = train_drugs[mask] + train_prots = train_prots[mask] + train_Y = train_Y[mask] + return (target_name, train_drugs, train_prots, train_Y, smile_graph) \ No newline at end of file diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/processing.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/processing.py new file mode 100644 index 00000000..c350151e --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/processing.py @@ -0,0 +1,91 @@ +"""Preprocessing scripts for GraphDTA.""" + +import pandas as pd +import numpy as np +import os +import rdkit +import sklearn +import torch +import json,pickle +from collections import OrderedDict +from rdkit import Chem +from rdkit.Chem import MolFromSmiles +import networkx as nx +from utils import * + +# Global setting +seq_voc = "ABCDEFGHIKLMNOPQRSTUVWXYZ" +seq_dict = {v:(i+1) for i,v in enumerate(seq_voc)} +seq_dict_len = len(seq_dict) +max_seq_len = 
1000

+def one_of_k_encoding(x, allowable_set):
+    """One-hot encode x against allowable_set; raise if x is not in the set."""
+    if x not in allowable_set:
+        raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))
+    return list(map(lambda s: x == s, allowable_set))
+
+def one_of_k_encoding_unk(x, allowable_set):
+    """Maps inputs not in the allowable set to the last element."""
+    if x not in allowable_set:
+        x = allowable_set[-1]
+    return list(map(lambda s: x == s, allowable_set))
+
+def atom_features(atom):
+    """Build the per-atom feature vector (symbol, degree, H count, implicit valence, aromaticity)."""
+    return np.array(one_of_k_encoding_unk(atom.GetSymbol(),['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na','Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb','Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H','Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr','Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
+                    one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
+                    one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
+                    one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6,7,8,9,10]) +
+                    [atom.GetIsAromatic()])
+
+def smile_to_graph(smile):
+    """SMILES to graph."""
+    mol = Chem.MolFromSmiles(smile)
+    c_size = mol.GetNumAtoms()
+
+    features = []
+    for atom in mol.GetAtoms():
+        feature = atom_features(atom)
+        features.append( feature / sum(feature) )
+    edges = []
+    for bond in mol.GetBonds():
+        edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
+    g = nx.Graph(edges).to_directed()
+    edge_index = []
+    for e1, e2 in g.edges:
+        edge_index.append([e1, e2])
+    return c_size, features, edge_index
+
+def seq_cat(prot):
+    """Encode a protein sequence as a fixed-length integer vector."""
+    x = np.zeros(max_seq_len)
+    for i, ch in enumerate(prot[:max_seq_len]):
+        x[i] = seq_dict[ch]
+    return x
+
+def process_data(df, target_name):
+    """Process data."""
+    pairs=[]
+    for _,row in df.iterrows():
+        pair = []
+        lg = Chem.MolToSmiles(Chem.MolFromSmiles(row[2]), isomericSmiles=True)
+        pair.append(lg)
+        pair.append(row[1])
+        pair.append(row[3])
+        pairs.append(pair)
+
+    pairs=pd.DataFrame(pairs)
+    # Drug
+    compound_iso_smiles = pairs.iloc[:,0]
+    compound_iso_smiles = set(compound_iso_smiles)
+    smile_graph = {}
+    for smile in compound_iso_smiles:
+        g = smile_to_graph(smile)
+        smile_graph[smile] = g
+    train_drugs, train_prots, train_Y = list(pairs.iloc[:,0]),list(pairs.iloc[:,1]),list(pairs.iloc[:,2])
+    XT = [seq_cat(t) for t in train_prots]
+    train_drugs, train_prots, train_Y = np.asarray(train_drugs), np.asarray(XT), np.asarray(train_Y)
+
+    return (train_drugs,train_prots,train_Y,smile_graph)
+
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_bindingDB.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_bindingDB.py
new file mode 100644
index 00000000..65f118e7
--- /dev/null
+++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_bindingDB.py
@@ -0,0 +1,271 @@
+"""Training scripts for GraphDTA backbone."""
+
+import rdkit
+import torch
+import sklearn
+import numpy as np
+import pandas as pd
+import sys, os
+import random
+from random import shuffle
+from time import time
+from rdkit import Chem
+import torch.nn as nn
+from argparse import ArgumentParser
+
+from models.gat import GATNet
+from models.gat_gcn import GAT_GCN
+from models.gcn import GCNNet
+from models.ginconv import GINConvNet
+from utils_bindingDB import *
+from preprocess import process_data
+
+# Set random seed
+seed = 1
+np.random.seed(seed)
+random.seed(seed)
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+torch.backends.cudnn.benchmark = False
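+# Disabling cuDNN benchmarking and forcing deterministic kernels trades some speed for run-to-run reproducibility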
+torch.backends.cudnn.deterministic = True
+
+# Set loss function
+loss_fn = nn.MSELoss()
+
+# Basic settings
+LOG_INTERVAL = 20
+
+
+# Training script
+def train(model, device, train_loader, optimizer, epoch):
+    """Training script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        train_loader: Dataloader of training set.
+        optimizer: Optimizer.
+        epoch: Epoch.
+
+    Returns:
+        loss: Output training loss.
+    """
+    print('Training on {} samples...'.format(len(train_loader.dataset)))
+    model.train()
+    for batch_idx, data in enumerate(train_loader):
+        data = data.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
+        loss.backward()
+        optimizer.step()
+        if batch_idx % LOG_INTERVAL == 0:
+            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
+                                                                           batch_idx * len(data.x),
+                                                                           len(train_loader.dataset),
+                                                                           100. * batch_idx / len(train_loader),
+                                                                           loss.item()))
+    return loss.item()
+
+
+def predicting(model, device, loader):
+    """Predicting script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        loader: Dataloader of validation/testing set.
+
+    Returns:
+        total_labels: Output ground truth labels.
+        total_preds: Output predictions.
+        total_groups: Output groups.
+    """
+    model.eval()
+    total_preds = torch.Tensor()
+    total_labels = torch.Tensor()
+    total_groups = torch.Tensor()
+    print('Make prediction for {} samples...'.format(len(loader.dataset)))
+    with torch.no_grad():
+        for data in loader:
+            data = data.to(device)
+            output = model(data)
+            # Accumulate predictions, labels and group IDs across batches
+            total_preds = torch.cat((total_preds, output.cpu()), 0)
+            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
+            total_groups = torch.cat((total_groups, data.g.view(-1, 1).cpu()), 0)
+    return total_labels.numpy().flatten(), total_preds.numpy().flatten(), total_groups.numpy().flatten()
+
+
+def main(args):
+    """Main function."""
+    # Basic settings
+    best_ci = 0
+    best_epoch = 0
+    best_train_loss = 10000
+    rounds = args.rounds
+
+    # Set CUDA device
+    cuda_name = "cuda:" + str(args.cudanum)
+    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
+
+    # Modeling...
+ modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model] + model_st = modeling.__name__ + print(model_st) + model = modeling().to(device) + + # Optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # Adam + + # Load data + train_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv") + val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv") + test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv") + + train_set = process_data(train_data) + val_set = process_data(val_data) + test_set = process_data(test_data) + + train_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_train', groups=train_set[0], xd = train_set[1], + xt = train_set[2], y = train_set[3], smile_graph = train_set[4]) + val_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_val', groups=val_set[0], xd = val_set[1], + xt = val_set[2], y = val_set[3], smile_graph = val_set[4]) + test_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_test', groups=test_set[0], xd = test_set[1], + xt = test_set[2], y = test_set[3], smile_graph = test_set[4]) + + # Make mini-batch processing + train_loader = DataLoader(train_generator, batch_size = args.batchsize, shuffle = True) + val_loader = DataLoader(val_generator, batch_size = args.batchsize, shuffle = False) + test_loader = DataLoader(test_generator, batch_size = args.batchsize, shuffle = False) + + # Training... + print("Training.....") + for epoch in range(args.epochs): + print("===============Go for Training===============") + train_loss = train(model, device, train_loader, optimizer, epoch+1) + + # Validation... + G, P, group_li = predicting(model, device, val_loader) + val_ci = ci(G, P) + + # Get length of validation set + result = {} + for gl in group_li: + if result.get(gl) == None: + result[gl] = 1 + else: + result[gl] += 1 + + lens = [] + lens.extend(result.values()) + + # Skip len=1 data + k = 0 + new_G, new_P, new_lens = [], [], [] + for ll in lens: + if ll == 1: + k += 1 + else: + new_G.extend(G[k:k+ll]) + new_P.extend(P[k:k+ll]) + new_lens.append(ll) + k += ll + new_G, new_P = np.array(new_G), np.array(new_P) + + # Calculate Weighted CI, Average CI of validation set + s = 0 + w_ci,a_ci = [],[] + for l in new_lens: + try: + w_ci.append(l*ci(new_G[s:s+l],new_P[s:s+l])) + a_ci.append(ci(new_G[s:s+l],new_P[s:s+l])) + except: + pass + s += l + weight_ci, average_ci = np.sum(w_ci)/np.sum(new_lens), np.mean(a_ci) + print("===============Go for Validation===============") + print("Weighted CI:",weight_ci) + print("Average CI:",average_ci) + print("Overall CI:",val_ci) + + files = open("bestResult/GraphDTA_"+model_st+"_BindingDB_ki_result"+str(args.rounds)+".txt",'a') + files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(weight_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n') + model_name = "bestModel/GraphDTA_"+model_st+"_BindingDB_ki_"+str(rounds)+".model" + + # Save the best result + if average_ci > best_ci: + best_ci = average_ci + best_epoch = epoch + best_train_loss = train_loss + # Save best model + print("Saving the best model...") + torch.save(model.state_dict(), model_name) + + print("===============Go for Testing===============") + # Load the model + model.load_state_dict(torch.load(model_name)) + + # Testing... 
+    test_G, test_P, test_group_li = predicting(model, device, test_loader)
+    test_CI, test_MSE = ci(test_G,test_P), mse(test_G,test_P)
+
+    # Get length of testing set
+    t_result = {}
+    for t_gl in test_group_li:
+        if t_result.get(t_gl)==None:
+            t_result[t_gl]=1
+        else:
+            t_result[t_gl]+=1
+
+    t_lens = []
+    t_lens.extend(t_result.values())
+
+    # Skip len=1 data
+    t_k = 0
+    t_new_G,t_new_P,t_new_lens = [],[],[]
+    for t_ll in t_lens:
+        if t_ll == 1:
+            t_k += 1
+        else:
+            t_new_G.extend(test_G[t_k:t_k+t_ll])
+            t_new_P.extend(test_P[t_k:t_k+t_ll])
+            t_new_lens.append(t_ll)
+            t_k += t_ll
+    t_new_G, t_new_P = np.array(t_new_G), np.array(t_new_P)
+
+    # Calculate Weighted CI, Average CI of testing set
+    t_s = 0
+    t_w_ci,t_a_ci = [],[]
+    for t_l in t_new_lens:
+        try:
+            t_w_ci.append(t_l*ci(t_new_G[t_s:t_s+t_l],t_new_P[t_s:t_s+t_l]))
+            t_a_ci.append(ci(t_new_G[t_s:t_s+t_l],t_new_P[t_s:t_s+t_l]))
+        except:
+            pass
+        t_s += t_l
+    test_weight_ci, test_average_ci = np.sum(t_w_ci)/np.sum(t_new_lens), np.mean(t_a_ci)
+
+    # Save the testing result
+    files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
+                str(test_average_ci) + ", test_weightedCI:" + str(test_weight_ci) + ", test_overallCI:" + str(test_CI) + "\n")
+    files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" + str(best_train_loss) + "\n")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description='Starting...')
+
+    parser.add_argument('--batchsize', default=512, type=int, metavar='N', help='Batch size')
+    parser.add_argument('--epochs', default=50, type=int, metavar='N', help='Number of total epochs')
+    parser.add_argument('--rounds', default=1, type=int, metavar='N', help='The Nth round')
+    parser.add_argument('--lr', default=5e-4, type=float, metavar='LR', help='Initial learning rate', dest='lr')
+    parser.add_argument('--cudanum', default=0, type=int, metavar='N', help='The Nth CUDA device')
+    parser.add_argument('--model', default=0, type=int, metavar='N', help='Select from GINConvNet, GATNet, GAT_GCN, GCNNet')
+
+    args = parser.parse_args()
+
+    beginT = time()
+    print("Starting Time: {}".format(beginT))
+    main(args)
+    endT = time()
+    print("Ending Time: {}".format(endT))
+    print("Duration is: {}".format(endT - beginT))
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_davis.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_davis.py
new file mode 100644
index 00000000..ad138e4d
--- /dev/null
+++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_davis.py
@@ -0,0 +1,205 @@
+"""Training scripts for GraphDTA backbone."""
+
+import rdkit
+import torch
+import sklearn
+import numpy as np
+import pandas as pd
+import sys, os
+import random
+from random import shuffle
+from time import time
+from rdkit import Chem
+import torch.nn as nn
+from argparse import ArgumentParser
+
+from models.gat import GATNet
+from models.gat_gcn import GAT_GCN
+from models.gcn import GCNNet
+from models.ginconv import GINConvNet
+from utils import *
+from processing import process_data
+
+# Set random seed
+seed = 1
+np.random.seed(seed)
+random.seed(seed)
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+torch.backends.cudnn.benchmark = False
+torch.backends.cudnn.deterministic = True
+
+# Set loss function
+loss_fn = nn.MSELoss()
+
+# Basic settings
+LOG_INTERVAL = 20
+
+
+# Training script
+def train(model, device, train_loader, optimizer, epoch):
+    """Training script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        train_loader: Dataloader of training set.
+        optimizer: Optimizer.
+        epoch: Epoch.
+
+    Returns:
+        loss: Output training loss.
+    """
+    print('Training on {} samples...'.format(len(train_loader.dataset)))
+    model.train()
+    for batch_idx, data in enumerate(train_loader):
+        data = data.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
+        loss.backward()
+        optimizer.step()
+        if batch_idx % LOG_INTERVAL == 0:
+            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
+                                                                           batch_idx * len(data.x),
+                                                                           len(train_loader.dataset),
+                                                                           100. * batch_idx / len(train_loader),
+                                                                           loss.item()))
+    return loss.item()
+
+
+def predicting(model, device, loader):
+    """Predicting script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        loader: Dataloader of validation/testing set.
+
+    Returns:
+        total_labels: Output ground truth labels.
+        total_preds: Output predictions.
+    """
+    model.eval()
+    total_preds = torch.Tensor()
+    total_labels = torch.Tensor()
+    print('Make prediction for {} samples...'.format(len(loader.dataset)))
+    with torch.no_grad():
+        for data in loader:
+            data = data.to(device)
+            output = model(data)
+            # Accumulate predictions and labels across batches
+            total_preds = torch.cat((total_preds, output.cpu()), 0)
+            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
+    return total_labels.numpy().flatten(),total_preds.numpy().flatten()
+
+
+def main(args):
+    """Main function."""
+    # Basic settings
+    best_ci = 0
+    best_epoch = 0
+    best_train_loss = 10000
+    rounds = args.rounds
+
+    # Set CUDA device
+    cuda_name = "cuda:" + str(args.cudanum)
+    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
+
+    # Modeling...
+    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
+    model_st = modeling.__name__
+    print(model_st)
+    model = modeling().to(device)
+
+    # Optimizer
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # Adam
+
+    # Load data
+    train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv")
+    val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv")
+    test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv")
+
+    train_set = process_data(train_data, 'train')
+    val_set = process_data(val_data, 'val')
+    test_set = process_data(test_data, 'test')
+
+    train_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_train' + str(rounds), xd = train_set[0],
+                                     xt = train_set[1], y = train_set[2], smile_graph = train_set[3])
+    val_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_val' + str(rounds), xd = val_set[0],
+                                   xt = val_set[1], y = val_set[2], smile_graph = val_set[3])
+    test_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_test', xd = test_set[0],
+                                    xt = test_set[1], y = test_set[2], smile_graph = test_set[3])
+
+    # Make mini-batch processing
+    train_loader = DataLoader(train_generator, batch_size = args.batchsize, shuffle = True)
+    val_loader = DataLoader(val_generator, batch_size = args.batchsize, shuffle = False)
+    test_loader = DataLoader(test_generator, batch_size = args.batchsize, shuffle = False)
+
+    # Training...
+    print("Training.....")
+    for epoch in range(args.epochs):
+        print("===============Go for Training===============")
+        train_loss = train(model, device, train_loader, optimizer, epoch+1)
+
+        # Validation...
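+        # DAVIS validation/test sets are assumed to contain 68 samples (one per drug) for every target, hence the fixed /68 grouping below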
+ G, P = predicting(model, device, val_loader) + val_ci = ci(G, P) + + # Calculate Weighted CI, Average CI of validation set + lens = int(len(G)/68) + average_ci = np.mean([ci(G[x*68:(x+1)*68],P[x*68:(x+1)*68]) for x in range(0,lens)]) + + print("===============Go for Validation===============") + print("Weighted CI:",average_ci) + print("Average CI:",average_ci) + print("Overall CI:",val_ci) + + files = open("bestResult/GraphDTA_"+model_st+"_davis_result"+str(args.rounds)+".txt",'a') + files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(average_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n') + model_name = "bestModel/GraphDTA_"+model_st+"_davis_"+str(rounds)+".model" + + # Save the best result + if average_ci > best_ci: + best_ci = average_ci + best_epoch = epoch + best_train_loss = train_loss + # Save best model + print("Saving the best model...") + torch.save(model.state_dict(), model_name) + + print("===============Go for Testing===============") + # Load the model + model.load_state_dict(torch.load(model_name)) + + # Testing... + test_G, test_P = predicting(model, device, test_loader) + test_CI, test_MSE = ci(test_G,test_P), mse(test_G,test_P) + + # Calculate Weighted CI, Average CI of testing set + t_lens = int(len(test_G)/68) + test_average_ci = np.mean([ci(test_G[x*68:(x+1)*68],test_P[x*68:(x+1)*68]) for x in range(0,t_lens)]) + + # Save the testing result + files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" + + str(test_average_ci) + ", test_weightedCI:" + str(test_average_ci) + ", test_overallCI:" + str(test_CI) + "\n") + files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" + str(best_train_loss) + "\n") + + +if __name__ == "__main__": + parser = ArgumentParser(description='Starting...') + + parser.add_argument('--batchsize', default=512, type=int, metavar='N', help='Batch size') + parser.add_argument('--epochs', default=100, type=int, metavar='N', help='Number of total epochs') + parser.add_argument('--rounds', default=1, type=int, metavar='N', help='The Nth round') + parser.add_argument('--lr', default=5e-4, type=float, metavar='LR', help='Initial learning rate', dest='lr') + parser.add_argument('--cudanum', default=0, type=int, metavar='N', help='The Nth CUDA device') + parser.add_argument('--model', default=0, type=int, metavar='N', help='Select from GINConvNet, GATNet, GAT_GCN, GCNNet') + + args = parser.parse_args() + + beginT = time() + print("Starting Time: {}".format(beginT)) + main(args) + endT = time() + print("Ending Time: {}".format(endT)) + print("Duration is: {}".format(endT - beginT)) \ No newline at end of file diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_kiba.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_kiba.py new file mode 100644 index 00000000..a164db23 --- /dev/null +++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/train_kiba.py @@ -0,0 +1,245 @@ +"""Training scripts for GraphDTA backbone.""" + +import rdkit +import torch +import sklearn +import numpy as np +import pandas as pd +import sys, os +import os.path +from os import path +import random +from random import shuffle +from time import time +from rdkit import Chem +import torch.nn as nn +from argparse import ArgumentParser + +from models.gat import GATNet +from models.gat_gcn import GAT_GCN +from models.gcn import GCNNet +from models.ginconv import GINConvNet +from utils import * +from processing import process_data +from get_len import get_kiba_len + +# Set 
random seed
+seed = 1
+np.random.seed(seed)
+random.seed(seed)
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+torch.backends.cudnn.benchmark = False
+torch.backends.cudnn.deterministic = True
+
+# Set loss function
+loss_fn = nn.MSELoss()
+
+# Basic settings
+LOG_INTERVAL = 20
+
+
+# Training script
+def train(model, device, train_loader, optimizer, epoch):
+    """Training script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        train_loader: Dataloader of training set.
+        optimizer: Optimizer.
+        epoch: Epoch.
+
+    Returns:
+        loss: Output training loss.
+    """
+    print('Training on {} samples...'.format(len(train_loader.dataset)))
+    model.train()
+    for batch_idx, data in enumerate(train_loader):
+        data = data.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
+        loss.backward()
+        optimizer.step()
+        if batch_idx % LOG_INTERVAL == 0:
+            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
+                                                                           batch_idx * len(data.x),
+                                                                           len(train_loader.dataset),
+                                                                           100. * batch_idx / len(train_loader),
+                                                                           loss.item()))
+    return loss.item()
+
+
+def predicting(model, device, loader):
+    """Predicting script for GraphDTA backbone model.
+
+    Args:
+        model: GraphDTA backbone model.
+        device: Device.
+        loader: Dataloader of validation/testing set.
+
+    Returns:
+        total_labels: Output ground truth labels.
+        total_preds: Output predictions.
+    """
+    model.eval()
+    total_preds = torch.Tensor()
+    total_labels = torch.Tensor()
+    print('Make prediction for {} samples...'.format(len(loader.dataset)))
+    with torch.no_grad():
+        for data in loader:
+            data = data.to(device)
+            output = model(data)
+            # Accumulate predictions and labels across batches
+            total_preds = torch.cat((total_preds, output.cpu()), 0)
+            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
+    return total_labels.numpy().flatten(),total_preds.numpy().flatten()
+
+
+def cal_len(path):
+    """Calculate length of each group."""
+    lines = open(path,'r').readlines()
+    li = []
+    for line in lines:
+        li.append(int(line.strip()))
+    lens = np.sum(li)
+    return li, lens
+
+
+def main(args):
+    """Main function."""
+    # Basic settings
+    best_ci = 0
+    best_epoch = 0
+    best_train_loss = 10000
+    rounds = args.rounds
+
+    # Set CUDA device
+    cuda_name = "cuda:" + str(args.cudanum)
+    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
+
+    # Modeling...
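+    # model_st (the backbone class name) is used below to tag result and checkpoint filenames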
+ modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model] + model_st = modeling.__name__ + print(model_st) + model = modeling().to(device) + + # Optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # Adam + + # Load data + train_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv") + val_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv") + test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv") + + train_set = process_data(train_data, 'train') + val_set = process_data(val_data, 'val') + test_set = process_data(test_data, 'test') + + train_generator = TestbedDataset(root = 'dataset', dataset = 'KIBA_train' + str(rounds), xd = train_set[0], + xt = train_set[1], y = train_set[2], smile_graph = train_set[3]) + val_generator = TestbedDataset(root = 'dataset', dataset = 'KIBA_val' + str(rounds), xd = val_set[0], + xt = val_set[1], y = val_set[2], smile_graph = val_set[3]) + test_generator = TestbedDataset(root = 'dataset', dataset = 'KIBA_test', xd = test_set[0], + xt = test_set[1], y = test_set[2], smile_graph = test_set[3]) + + # Make mini-batch processing + train_loader = DataLoader(train_generator, batch_size = args.batchsize, shuffle = True) + val_loader = DataLoader(val_generator, batch_size = args.batchsize, shuffle = False) + test_loader = DataLoader(test_generator, batch_size = args.batchsize, shuffle = False) + + # Training... + print("Training.....") + for epoch in range(args.epochs): + print("===============Go for Training===============") + train_loss = train(model, device, train_loader, optimizer, epoch+1) + + # Validation... + G, P = predicting(model, device, val_loader) + val_ci = ci(G, P) + + val_path = "../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt" + # Check if kiba len file exists + if(path.exists(val_path) == False): + get_kiba_len() + + # Calculate Weighted CI, Average CI of validation set + li,lens = cal_len(val_path) + s = 0 + w_ci,a_ci = [],[] + for l in li: + try: + w_ci.append(l*ci(G[s:s+l],P[s:s+l])) + a_ci.append(ci(G[s:s+l],P[s:s+l])) + except: + pass + s += l + weight_ci, average_ci = np.sum(w_ci)/np.sum(li), np.mean(a_ci) + + print("===============Go for Validation===============") + print("Weighted CI:",weight_ci) + print("Average CI:",average_ci) + print("Overall CI:",val_ci) + + files = open("bestResult/GraphDTA_"+model_st+"_kiba_result"+str(args.rounds)+".txt",'a') + files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(weight_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n') + model_name = "bestModel/GraphDTA_"+model_st+"_kiba_"+str(rounds)+".model" + + # Save the best result + if average_ci > best_ci: + best_ci = average_ci + best_epoch = epoch + best_train_loss = train_loss + # Save best model + print("Saving the best model...") + torch.save(model.state_dict(), model_name) + + print("===============Go for Testing===============") + # Load the model + model.load_state_dict(torch.load(model_name)) + + # Testing... 
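+    # Test-set group sizes are read from kiba_len.txt, regenerated via get_kiba_len() if missing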
+    test_G, test_P = predicting(model, device, test_loader)
+    test_CI, test_MSE = ci(test_G,test_P), mse(test_G,test_P)
+
+    test_path = "../../Data/KIBA/kiba_len.txt"
+    # Check if kiba len file exists
+    if(path.exists(test_path) == False):
+        get_kiba_len()
+    # Calculate Weighted CI, Average CI of testing set
+    t_li, t_lens = cal_len(test_path)
+    s = 0
+    w_ci,a_ci = [],[]
+    for l in t_li:
+        try:
+            w_ci.append(l*ci(test_G[s:s+l],test_P[s:s+l]))
+            a_ci.append(ci(test_G[s:s+l],test_P[s:s+l]))
+        except:
+            pass
+        s += l
+    test_weight_ci, test_average_ci = np.sum(w_ci)/np.sum(t_li), np.mean(a_ci)
+
+    # Save the testing result
+    files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" + str(test_average_ci) +
+                ", test_weightedCI:" + str(test_weight_ci) + ", test_overallCI:" + str(test_CI) + "\n")
+    files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" + str(best_train_loss) + "\n")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description='Starting...')
+
+    parser.add_argument('--batchsize', default=512, type=int, metavar='N', help='Batch size')
+    parser.add_argument('--epochs', default=200, type=int, metavar='N', help='Number of total epochs')
+    parser.add_argument('--rounds', default=1, type=int, metavar='N', help='The Nth round')
+    parser.add_argument('--lr', default=5e-4, type=float, metavar='LR', help='Initial learning rate', dest='lr')
+    parser.add_argument('--cudanum', default=0, type=int, metavar='N', help='The Nth CUDA device')
+    parser.add_argument('--model', default=0, type=int, metavar='N', help='Select from GINConvNet, GATNet, GAT_GCN, GCNNet')
+
+    args = parser.parse_args()
+
+    beginT = time()
+    print("Starting Time: {}".format(beginT))
+    main(args)
+    endT = time()
+    print("Ending Time: {}".format(endT))
+    print("Duration is: {}".format(endT - beginT))
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils.py
new file mode 100644
index 00000000..e392ea3e
--- /dev/null
+++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils.py
@@ -0,0 +1,125 @@
+"""Utils scripts for GraphDTA."""
+
+import os
+import numpy as np
+from math import sqrt
+from scipy import stats
+from torch_geometric.data import InMemoryDataset, DataLoader
+from torch_geometric import data as DATA
+import torch
+
+
+class TestbedDataset(InMemoryDataset):
+    """TestbedDataset."""
+    def __init__(self, root='/tmp', dataset='DAVIS',
+                 xd=None, xt=None, y=None, transform=None,
+                 pre_transform=None,smile_graph=None):
+        # Root is required for save preprocessed data, default is '/tmp'
+        super(TestbedDataset, self).__init__(root, transform, pre_transform)
+        self.dataset = dataset
+        if os.path.isfile(self.processed_paths[0]):
+            print('Pre-processed data found: {}, loading ...'.format(self.processed_paths[0]))
+            self.data, self.slices = torch.load(self.processed_paths[0])
+        else:
+            print('Pre-processed data {} not found, doing pre-processing...'.format(self.processed_paths[0]))
+            self.process(xd, xt, y,smile_graph)
+
+    @property
+    def raw_file_names(self):
+        pass
+
+    @property
+    def processed_file_names(self):
+        return [self.dataset + '.pt']
+
+    def download(self):
+        # Download to `self.raw_dir`.
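+        # Nothing to fetch here: graphs are built locally in process() from the provided SMILES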
+        pass
+
+    def _download(self):
+        pass
+
+    def _process(self):
+        if not os.path.exists(self.processed_dir):
+            os.makedirs(self.processed_dir)
+
+    def process(self, xd, xt, y, smile_graph):
+        """Customize the process method to fit the task of drug-target affinity prediction.
+
+        Args:
+            xd: List of SMILES.
+            xt: List of encoded targets (categorical or one-hot).
+            y: List of labels.
+            smile_graph: Dict mapping each SMILES to its (c_size, features, edge_index) graph.
+
+        The collated graphs are stored on `self.data`/`self.slices` and cached to
+        `self.processed_paths[0]`.
+        """
+        assert (len(xd) == len(xt) and len(xt) == len(y)), "The three lists must be the same length!"
+        data_list = []
+        data_len = len(xd)
+        for i in range(data_len):
+            smiles = xd[i]
+            target = xt[i]
+            labels = y[i]
+            # Convert SMILES to a molecular graph using rdkit
+            c_size, features, edge_index = smile_graph[smiles]
+            # Make the graph ready for PyTorch Geometric's GCN algorithms
+            GCNData = DATA.Data(x=torch.Tensor(features),
+                                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
+                                y=torch.FloatTensor([labels]))
+            GCNData.target = torch.LongTensor([target])
+            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
+            # Append graph, label and target sequence to the data list
+            data_list.append(GCNData)
+
+        if self.pre_filter is not None:
+            data_list = [data for data in data_list if self.pre_filter(data)]
+
+        if self.pre_transform is not None:
+            data_list = [self.pre_transform(data) for data in data_list]
+        print('Graph construction done. Saving to file.')
+        self.data, self.slices = self.collate(data_list)
+        # Cache the collated data so later runs can load it directly
+        torch.save((self.data, self.slices), self.processed_paths[0])
+
+def rmse(y,f):
+    """RMSE."""
+    rmse = sqrt(((y - f)**2).mean(axis=0))
+    return rmse
+
+def mse(y,f):
+    """MSE."""
+    mse = ((y - f)**2).mean(axis=0)
+    return mse
+
+def pearson(y,f):
+    """Pearson correlation coefficient."""
+    rp = np.corrcoef(y, f)[0,1]
+    return rp
+
+def spearman(y,f):
+    """Spearman rank correlation."""
+    rs = stats.spearmanr(y, f)[0]
+    return rs
+
+def ci(y,f):
+    """Concordance index (CI)."""
+    # Sort by label so every admissible pair satisfies y[i] > y[j] for i > j
+    ind = np.argsort(y)
+    y = y[ind]
+    f = f[ind]
+    i = len(y)-1
+    j = i-1
+    z = 0.0  # number of admissible pairs
+    S = 0.0  # concordant pairs count 1, ties count 0.5
+    while i > 0:
+        while j >= 0:
+            if y[i] > y[j]:
+                z = z+1
+                u = f[i] - f[j]
+                if u > 0:
+                    S = S + 1
+                elif u == 0:
+                    S = S + 0.5
+            j = j - 1
+        i = i - 1
+        j = i-1
+    ci = S/z  # raises ZeroDivisionError when there are no admissible pairs
+    return ci
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils_bindingDB.py b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils_bindingDB.py
new file mode 100644
index 00000000..8fc5cd1f
--- /dev/null
+++ b/apps/drug_target_interaction/hybriddta/pointwise/GraphDTA/utils_bindingDB.py
@@ -0,0 +1,116 @@
+"""Utils scripts for GraphDTA (BindingDB variant with per-sample groups)."""
+
+import os
+import numpy as np
+from math import sqrt
+from scipy import stats
+from torch_geometric.data import InMemoryDataset, DataLoader
+from torch_geometric import data as DATA
+import torch
+
+
+class TestbedDataset(InMemoryDataset):
+    """In-memory dataset of drug-target pairs as PyTorch-Geometric graphs."""
+    def __init__(self, root='/tmp', dataset='BindingDB', groups=None,
+                 xd=None, xt=None, y=None, transform=None,
+                 pre_transform=None, smile_graph=None):
+        # `root` is where the preprocessed data is cached; default is '/tmp'
+        super(TestbedDataset, self).__init__(root, transform, pre_transform)
+        self.dataset = dataset
+        if os.path.isfile(self.processed_paths[0]):
+            print('Pre-processed data found: {}, loading ...'.format(self.processed_paths[0]))
+            self.data, self.slices = torch.load(self.processed_paths[0])
+        else:
+            print('Pre-processed data {} not found, doing pre-processing...'.format(self.processed_paths[0]))
+            self.process(groups, xd, xt, y, smile_graph)
+
+    @property
+    def processed_file_names(self):
+        return [self.dataset + '.pt']
+
+    def _process(self):
+        if not os.path.exists(self.processed_dir):
+            os.makedirs(self.processed_dir)
+
+    def process(self, groups, xd, xt, y, smile_graph):
+        """Customize the process method to fit the task of drug-target affinity prediction.
+
+        Args:
+            groups: List of per-sample group labels (stored on each graph as `g`).
+            xd: List of SMILES.
+            xt: List of encoded targets (categorical or one-hot).
+            y: List of labels.
+            smile_graph: Dict mapping each SMILES to its (c_size, features, edge_index) graph.
+
+        The collated graphs are stored on `self.data`/`self.slices` and cached to
+        `self.processed_paths[0]`.
+        """
+        assert (len(xd) == len(xt) and len(xt) == len(y) and len(y) == len(groups)), "The four lists must be the same length!"
+        data_list = []
+        data_len = len(xd)
+        for i in range(data_len):
+            smiles = xd[i]
+            target = xt[i]
+            labels = y[i]
+            group = groups[i]
+            # Convert SMILES to a molecular graph using rdkit
+            c_size, features, edge_index = smile_graph[smiles]
+            # Make the graph ready for PyTorch Geometric's GCN algorithms
+            GCNData = DATA.Data(g=torch.FloatTensor([group]),
+                                x=torch.Tensor(features),
+                                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
+                                y=torch.FloatTensor([labels]))
+            GCNData.target = torch.LongTensor([target])
+            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
+            # Append graph, label and target sequence to the data list
+            data_list.append(GCNData)
+
+        if self.pre_filter is not None:
+            data_list = [data for data in data_list if self.pre_filter(data)]
+
+        if self.pre_transform is not None:
+            data_list = [self.pre_transform(data) for data in data_list]
+        print('Graph construction done. Saving to file.')
+        self.data, self.slices = self.collate(data_list)
+        # Cache the collated data so later runs can load it directly
+        torch.save((self.data, self.slices), self.processed_paths[0])
+
+def rmse(y,f):
+    """RMSE."""
+    rmse = sqrt(((y - f)**2).mean(axis=0))
+    return rmse
+
+def mse(y,f):
+    """MSE."""
+    mse = ((y - f)**2).mean(axis=0)
+    return mse
+
+def pearson(y,f):
+    """Pearson correlation coefficient."""
+    rp = np.corrcoef(y, f)[0,1]
+    return rp
+
+def spearman(y,f):
+    """Spearman rank correlation."""
+    rs = stats.spearmanr(y, f)[0]
+    return rs
+
+def ci(y,f):
+    """Concordance index (CI)."""
+    # Sort by label so every admissible pair satisfies y[i] > y[j] for i > j
+    ind = np.argsort(y)
+    y = y[ind]
+    f = f[ind]
+    i = len(y)-1
+    j = i-1
+    z = 0.0  # number of admissible pairs
+    S = 0.0  # concordant pairs count 1, ties count 0.5
+    while i > 0:
+        while j >= 0:
+            if y[i] > y[j]:
+                z = z+1
+                u = f[i] - f[j]
+                if u > 0:
+                    S = S + 1
+                elif u == 0:
+                    S = S + 0.5
+            j = j - 1
+        i = i - 1
+        j = i-1
+    ci = S/z  # raises ZeroDivisionError when there are no admissible pairs
+    return ci
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/get_len.py b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/get_len.py
index 019a73ea..a72f5240 100644
--- a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/get_len.py
+++ b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/get_len.py
@@ -20,16 +20,16 @@ def get_kiba_len():
     # Get length of validation set
     for cv in ["CV1", "CV2", "CV3", "CV4", "CV5"]:
-        df = pd.read_csv("../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
+        df = pd.read_csv("../../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
         df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
-        f = open("../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
+        f = open("../../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
         for i in df['counts'].values:
             f.write(str(i) + "\n")
 
     # Get length of testing set
-    df = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv")
+    df = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")
     df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
-    f = open("../Data/KIBA/kiba_len.txt",'a')
+    f = open("../../Data/KIBA/kiba_len.txt",'a')
     for i in df['counts'].values:
         f.write(str(i) + "\n")
\ No newline at end of file
diff --git a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_bindingdb.py b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_bindingdb.py
index 7b41d60f..8d3d81ed
100644 --- a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_bindingdb.py +++ b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_bindingdb.py @@ -113,9 +113,9 @@ def main(args): optim = utils.Adam(parameters=model.parameters(), learning_rate=args.lr) # Adam # Load raw data - train_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv") - val_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv") - test_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv") + train_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv") + val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv") + test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv") train_set = BindingDB_Encoder(train_data.index.values, train_data) val_set = BindingDB_Encoder(val_data.index.values, val_data) @@ -180,8 +180,8 @@ def main(args): model_name = "bestModel/MolTrans_BindingDB_ki_"+str(rounds)+".model" # Save the best result - if weight_ci > best_ci: - best_ci = weight_ci + if average_ci > best_ci: + best_ci = average_ci best_epoch = epoch best_train_loss = train_loss # Save best model diff --git a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_davis.py b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_davis.py index 69625e6e..65d554df 100644 --- a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_davis.py +++ b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_davis.py @@ -109,9 +109,9 @@ def main(args): optim = utils.Adam(parameters=model.parameters(), learning_rate=args.lr) # Adam # Load raw data - train_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv") - val_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv") - test_data = pd.read_csv("../Data/DAVIS/test_DAVIS_unseenP_seenD.csv") + train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv") + val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv") + test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv") train_set = Basic_Encoder(train_data.index.values, train_data) val_set = Basic_Encoder(val_data.index.values, val_data) diff --git a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_kiba.py b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_kiba.py index c969a613..8f30b41b 100644 --- a/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_kiba.py +++ b/apps/drug_target_interaction/hybriddta/pointwise/Moltrans/train_kiba.py @@ -123,9 +123,9 @@ def main(args): optim = utils.Adam(parameters=model.parameters(), learning_rate=args.lr) # Adam # Load raw data - train_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv") - val_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv") - test_data = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv") + train_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv") + val_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv") + test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv") train_set = 
Basic_Encoder(train_data.index.values, train_data)
     val_set = Basic_Encoder(val_data.index.values, val_data)
@@ -145,7 +145,7 @@ def main(args):
         G, P = predicting(model, val_loader)
         val_ci = concordance_index(G,P)
 
-        val_path = "../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt"
+        val_path = "../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt"
         # Check if kiba len file exists
         if(path.exists(val_path) == False):
             get_kiba_len()
@@ -155,8 +155,11 @@ def main(args):
         s = 0
         w_ci,a_ci = [],[]
         for l in li:
-            w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
-            a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+            try:
+                w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
+                a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+            except ZeroDivisionError:
+                pass
             s += l
         weight_ci, average_ci = np.sum(w_ci)/lens, np.mean(a_ci)
 
@@ -170,8 +173,8 @@ def main(args):
         model_name = "bestModel/MolTrans_kiba_"+str(rounds)+".model"
 
         # Save the best result
-        if weight_ci > best_ci:
-            best_ci = weight_ci
+        if average_ci > best_ci:
+            best_ci = average_ci
             best_epoch = epoch
             best_train_loss = train_loss
             # Save best model
@@ -188,13 +191,16 @@ def main(args):
         test_CI,test_MSE = concordance_index(test_G,test_P), mse(test_G,test_P)
 
         # Calculate Weighted CI, Average CI of testing set
-        test_path = "../Data/KIBA/kiba_len.txt"
+        test_path = "../../Data/KIBA/kiba_len.txt"
         t_li ,t_lens = cal_len(test_path)
         s = 0
         w_ci,a_ci = [],[]
         for l in t_li:
-            w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
-            a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
+            try:
+                w_ci.append(l*concordance_index(test_G[s:s+l],test_P[s:s+l]))
+                a_ci.append(concordance_index(test_G[s:s+l],test_P[s:s+l]))
+            except ZeroDivisionError:
+                pass
             s += l
         test_weight_ci, test_average_ci = np.sum(w_ci)/t_lens, np.mean(a_ci)
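
The per-target weighted/average CI loop above is repeated across the DeepDTA, GraphDTA, and Moltrans training scripts. A minimal sketch of a shared helper is below; the name `grouped_ci` is illustrative and not part of this patch, and `concordance_index` is assumed to be `lifelines.utils.concordance_index`, which raises `ZeroDivisionError` for a block with no admissible pairs:
```python
import numpy as np
from lifelines.utils import concordance_index


def grouped_ci(G, P, lens):
    """Weighted and average CI over contiguous per-target blocks.

    G and P are label/prediction arrays ordered so that the first lens[0]
    entries belong to the first target, the next lens[1] to the second, etc.
    """
    s, w_ci, a_ci = 0, [], []
    for l in lens:
        try:
            block_ci = concordance_index(G[s:s + l], P[s:s + l])
            w_ci.append(l * block_ci)  # weight each target's CI by its size
            a_ci.append(block_ci)
        except ZeroDivisionError:      # no admissible pairs for this target
            pass
        s += l
    # weighted CI = sum(l_k * CI_k) / sum(l_k); average CI = unweighted mean
    return np.sum(w_ci) / np.sum(lens), np.mean(a_ci)
```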