update hybriddta
SuperXiang committed Dec 1, 2021
1 parent cc16b40 commit 39a64a7
Showing 21 changed files with 1,606 additions and 50 deletions.
34 changes: 32 additions & 2 deletions apps/drug_target_interaction/hybriddta/README.md
@@ -1,6 +1,6 @@
# HybridDTA

Source code for paper: "HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction".
Source code for paper: "HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction". A preprint is available on [bioRxiv](https://www.biorxiv.org/content/10.1101/2021.11.23.469641v1).

## Backgrounds

@@ -72,7 +72,7 @@ python run_pairwise_Moltrans_bindingDB.py --data_path '../../Data/' "--is_mixed


### Baseline
We reimplement and provide all the baseline backbone models as following.
We reproduce and provide all the baseline backbone models as follows.

#### DeepDTA
```bash
@@ -95,6 +95,18 @@ CUDA_VISIBLE_DEVICES=0 python train_bindingdb.py --batchsize 256 --epochs 50 --r
```bash
cd ./pointwise/GraphDTA
```
##### Run the training script for Davis with cross-validation
```bash
python train_davis.py --batchsize 512 --epochs 100 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
```
##### Run the training script for KIBA with cross-validation
```bash
python train_kiba.py --batchsize 512 --epochs 200 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
```
##### Run the training script for BindingDB
```bash
python train_bindingdb.py --batchsize 512 --epochs 50 --rounds 1 --lr 5e-4 --cudanum 0 --model 2
```

#### Moltrans
```bash
@@ -114,6 +126,24 @@ CUDA_VISIBLE_DEVICES=0 python train_bindingdb.py --batchsize 64 --epochs 50 --ro
```


## Citation

If you find our work helpful in your research, please cite:
```bibtex
@article {Luo2021.11.23.469641,
author = {Luo, Hongyu and Xiang, Yingfei and Fang, Xiaomin and Lin, Wei and Wang, Fan and Wu, Hua and Wang, Haifeng},
title = {HybridDTA: Hybrid Data Fusion through Pairwise Training for Drug-Target Affinity Prediction},
elocation-id = {2021.11.23.469641},
year = {2021},
doi = {10.1101/2021.11.23.469641},
publisher = {Cold Spring Harbor Laboratory},
URL = {https://www.biorxiv.org/content/early/2021/11/23/2021.11.23.469641},
eprint = {https://www.biorxiv.org/content/early/2021/11/23/2021.11.23.469641.full.pdf},
journal = {bioRxiv}
}
```


## Reference

**DAVIS**
@@ -20,16 +20,16 @@
def get_kiba_len():
# Get length of validation set
for cv in ["CV1", "CV2", "CV3", "CV4", "CV5"]:
df = pd.read_csv("../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
df = pd.read_csv("../../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
f = open("../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
f = open("../../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
for i in df['counts'].values:
f.write(str(i) + "\n")


# Get length of testing set
df = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv")
df = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")
df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
f = open("../Data/KIBA/kiba_len.txt",'a')
f = open("../../Data/KIBA/kiba_len.txt",'a')
for i in df['counts'].values:
f.write(str(i) + "\n")
@@ -106,9 +106,9 @@ def main(args):
optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam

# Load raw data
train_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv")
val_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv")
test_data = pd.read_csv("../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv")
train_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv")
val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv")
test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv")

train_set = BindingDB_Encoder(train_data.index.values, train_data)
val_set = BindingDB_Encoder(val_data.index.values, val_data)
@@ -176,8 +176,8 @@ def main(args):
model_name = "bestModel/DeepDTA_BindingDB_ki_"+str(rounds)+".model"

# Save the best result
if weight_ci > best_ci:
best_ci = weight_ci
if average_ci > best_ci:
best_ci = average_ci
best_epoch = epoch
best_train_loss = train_loss
# Save best model
@@ -103,9 +103,9 @@ def main(args):
optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam

# Load raw data
train_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv")
val_data = pd.read_csv("../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv")
test_data = pd.read_csv("../Data/DAVIS/test_DAVIS_unseenP_seenD.csv")
train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv")
val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv")
test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv")

train_set = Basic_Encoder(train_data.index.values, train_data)
val_set = Basic_Encoder(val_data.index.values, val_data)
@@ -116,9 +116,9 @@ def main(args):
optim = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate = args.lr) # Adam

# Load raw data
train_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv")
val_data = pd.read_csv("../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv")
test_data = pd.read_csv("../Data/KIBA/test_KIBA_unseenP_seenD.csv")
train_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_train.csv")
val_data = pd.read_csv("../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_KIBA_unseenP_seenD_val.csv")
test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")

train_set = Basic_Encoder(train_data.index.values, train_data)
val_set = Basic_Encoder(val_data.index.values, val_data)
@@ -138,18 +138,21 @@ def main(args):
G, P = predicting(model, val_loader)
val_ci = concordance_index(G,P)

val_path = "../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt"
val_path = "../../Data/KIBA/CV"+str(rounds)+"/CV"+str(rounds)+"_val.txt"
# Check if kiba len file exists
if(path.exists(val_path) == False):
get_kiba_len()

# Calculate Weighted CI, Average CI of validation set
li ,lens = cal_len(val_path)
li, lens = cal_len(val_path)
s = 0
w_ci,a_ci = [],[]
for l in li:
w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
try:
w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
except:
pass
s += l
weight_ci, average_ci = np.sum(w_ci)/lens, np.mean(a_ci)

@@ -163,8 +166,8 @@ def main(args):
model_name = "bestModel/MolTrans_kiba_"+str(rounds)+".model"

# Save the best result
if weight_ci > best_ci:
best_ci = weight_ci
if average_ci > best_ci:
best_ci = average_ci
best_epoch = epoch
best_train_loss = train_loss
# Save best model
@@ -180,17 +183,20 @@ def main(args):
test_G, test_P = predicting(model, test_loader)
test_CI,test_MSE = concordance_index(test_G,test_P), mse(test_G,test_P)

test_path = "../Data/KIBA/kiba_len.txt"
test_path = "../../Data/KIBA/kiba_len.txt"
# Check if kiba len file exists
if(path.exists(test_path) == False):
get_kiba_len()
# Calculate Weighted CI, Average CI of testing set
t_li ,t_lens = cal_len(test_path)
s = 0
w_ci,a_ci = [],[]
w_ci, a_ci = [], []
for l in t_li:
w_ci.append(l*concordance_index(G[s:s+l],P[s:s+l]))
a_ci.append(concordance_index(G[s:s+l],P[s:s+l]))
try:
w_ci.append(l*concordance_index(test_G[s:s+l],test_P[s:s+l]))
a_ci.append(concordance_index(test_G[s:s+l],test_P[s:s+l]))
except:
pass
s += l
test_weight_ci, test_average_ci = np.sum(w_ci)/t_lens, np.mean(a_ci)
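The hunks above compute a concordance index (CI) per target group and aggregate it two ways: a weighted CI, where each group's CI is weighted by its size, and an average CI, where every target counts equally; this commit switches checkpoint selection from the weighted to the average variant. Below is a condensed sketch of that aggregation. It assumes `concordance_index` is `lifelines.utils.concordance_index` (an assumption; the scripts only show the call) and that `li`/`lens` are the per-target group sizes and their sum as returned by `cal_len()`.

```python
# A condensed sketch of the per-target CI aggregation used above (not repository code).
import numpy as np
from lifelines.utils import concordance_index  # assumed source of concordance_index


def grouped_ci(G, P, li, lens):
    """Return (weighted CI, average CI) over per-target groups.

    weighted CI = sum_i(l_i * CI_i) / sum_i(l_i);  average CI = mean_i(CI_i).
    """
    w_ci, a_ci, s = [], [], 0
    for l in li:
        try:
            ci = concordance_index(G[s:s + l], P[s:s + l])
            w_ci.append(l * ci)
            a_ci.append(ci)
        except ZeroDivisionError:
            # A group with no admissible pairs (e.g. all labels equal) is skipped,
            # mirroring the bare except in the script above.
            pass
        s += l
    return np.sum(w_ci) / lens, np.mean(a_ci)
```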

@@ -0,0 +1,35 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Calculate length of each group in dataset."""

import pandas as pd


def get_kiba_len():
# Get length of validation set
for cv in ["CV1", "CV2", "CV3", "CV4", "CV5"]:
df = pd.read_csv("../../Data/KIBA/"+cv+"/"+cv+"_KIBA_unseenP_seenD_val.csv")
df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
f = open("../../Data/KIBA/"+cv+"/"+cv+"_val.txt",'a')
for i in df['counts'].values:
f.write(str(i) + "\n")


# Get length of testing set
df = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")
df = df.groupby(['Target ID']).size().reset_index(name = 'counts')
f = open("../../Data/KIBA/kiba_len.txt",'a')
for i in df['counts'].values:
f.write(str(i) + "\n")
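The files written here hold one group size per line (one line per Target ID). The training scripts read them back through `cal_len()`, whose implementation is not part of this diff; the sketch below is only an assumption of what such a helper might look like.

```python
# Hypothetical sketch of the cal_len() helper that consumes these length files;
# cal_len is called by the training scripts but not shown in this commit.
def cal_len(path):
    """Read one group size per line and return (list of sizes, total count)."""
    with open(path) as f:
        lengths = [int(line) for line in f if line.strip()]
    return lengths, sum(lengths)
```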
@@ -0,0 +1,74 @@
"""GraphDTA_GAT backbone model."""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import GATConv
from torch_geometric.nn import global_max_pool as gmp


# GAT backbone model
class GATNet(torch.nn.Module):
"""GAT model.
Args:
data: Input data.
Returns:
out: Prediction results.
"""
def __init__(self, num_features_xd=78, n_output=1, num_features_xt=25,
n_filters=32, embed_dim=128, output_dim=128, dropout=0.2):
super(GATNet, self).__init__()
# Basic config
self.relu = nn.ReLU()
self.dropout = nn.Dropout(dropout)
# SMILES graph branch
self.gcn1 = GATConv(num_features_xd, num_features_xd, heads=10, dropout=dropout)
self.gcn2 = GATConv(num_features_xd * 10, output_dim, dropout=dropout)
self.fc_g1 = nn.Linear(output_dim, output_dim)
# Protein sequence branch (1d conv)
self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim)
self.conv_xt1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8)
self.fc_xt1 = nn.Linear(32*121, output_dim)
# Combined layers
self.fc1 = nn.Linear(256, 1024)
self.fc2 = nn.Linear(1024, 256)
self.out = nn.Linear(256, n_output)

def forward(self, data):
"""tbd."""
# Get graph input
x, edge_index, batch = data.x, data.edge_index, data.batch
# Get protein input
target = data.target

x = F.dropout(x, p=0.2, training=self.training)
x = F.elu(self.gcn1(x, edge_index))

x = F.dropout(x, p=0.2, training=self.training)
x = self.gcn2(x, edge_index)
x = self.relu(x)
x = gmp(x, batch) # global max pooling

x = self.fc_g1(x)
x = self.relu(x)
# 1d conv layers
embedded_xt = self.embedding_xt(target)
conv_xt = self.conv_xt1(embedded_xt)
conv_xt = self.relu(conv_xt)
# Flatten
xt = conv_xt.view(-1, 32 * 121)
xt = self.fc_xt1(xt)
# Concat
xc = torch.cat((x, xt), 1)
# Add some dense layers
xc = self.fc1(xc)
xc = self.relu(xc)
xc = self.dropout(xc)
xc = self.fc2(xc)
xc = self.relu(xc)
xc = self.dropout(xc)
out = self.out(xc)
return out
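As a quick sanity check of the expected tensor shapes, here is a minimal, hypothetical usage sketch (not part of the repository). It assumes torch and torch_geometric are installed, that the `GATNet` class above is in scope, and that inputs follow the sizes the layers imply: 78-dimensional atom features and a protein sequence encoded as 1000 integer tokens over a 25-symbol alphabet.

```python
# Minimal shape check for GATNet (a sketch under the assumptions stated above).
import torch
from torch_geometric.data import Batch, Data

num_atoms = 20
drug = Data(
    x=torch.randn(num_atoms, 78),                     # atom feature matrix (78 features per node)
    edge_index=torch.randint(0, num_atoms, (2, 40)),  # random bond list for illustration
)
drug.target = torch.randint(0, 26, (1, 1000))         # encoded protein sequence (1000 tokens)

model = GATNet()  # the class defined above; import path depends on where the file lives
model.eval()
with torch.no_grad():
    pred = model(Batch.from_data_list([drug]))
print(pred.shape)  # expected: torch.Size([1, 1]), one affinity value per drug-target pair
```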
@@ -0,0 +1,74 @@
"""GraphDTA_GATGCN backbone model."""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import GCNConv, GATConv, GINConv, global_add_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp


# GATGCN backbone model
class GAT_GCN(torch.nn.Module):
"""GATGCN model.
Args:
data: Input data.
Returns:
out: Prediction results.
"""
def __init__(self, n_output=1, num_features_xd=78, num_features_xt=25,
n_filters=32, embed_dim=128, output_dim=128, dropout=0.2):
super(GAT_GCN, self).__init__()
# Basic config
self.relu = nn.ReLU()
self.dropout = nn.Dropout(dropout)
self.n_output = n_output
# SMILES graph branch
self.conv1 = GATConv(num_features_xd, num_features_xd, heads=10)
self.conv2 = GCNConv(num_features_xd*10, num_features_xd*10)
self.fc_g1 = torch.nn.Linear(num_features_xd*10*2, 1500)
self.fc_g2 = torch.nn.Linear(1500, output_dim)
# Protein sequence branch (1d conv)
self.embedding_xt = nn.Embedding(num_features_xt + 1, embed_dim)
self.conv_xt_1 = nn.Conv1d(in_channels=1000, out_channels=n_filters, kernel_size=8)
self.fc1_xt = nn.Linear(32*121, output_dim)
# Combined layers
self.fc1 = nn.Linear(256, 1024)
self.fc2 = nn.Linear(1024, 512)
self.out = nn.Linear(512, self.n_output) # n_output = 1 for regression task

def forward(self, data):
"""tbd."""
# Get graph input
x, edge_index, batch = data.x, data.edge_index, data.batch
# Get protein input
target = data.target

x = self.conv1(x, edge_index)
x = self.relu(x)
x = self.conv2(x, edge_index)
x = self.relu(x)
# Apply global max pooling (gmp) and global mean pooling (gap)
x = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
x = self.relu(self.fc_g1(x))
x = self.dropout(x)
x = self.fc_g2(x)

embedded_xt = self.embedding_xt(target)
conv_xt = self.conv_xt_1(embedded_xt)
# Flatten
xt = conv_xt.view(-1, 32 * 121)
xt = self.fc1_xt(xt)
# Concat
xc = torch.cat((x, xt), 1)
# Add some dense layers
xc = self.fc1(xc)
xc = self.relu(xc)
xc = self.dropout(xc)
xc = self.fc2(xc)
xc = self.relu(xc)
xc = self.dropout(xc)
out = self.out(xc)
return out
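One detail worth noting: the drug branch concatenates global max pooling with global mean pooling, which is why `fc_g1` expects `num_features_xd*10*2 = 1560` inputs after the 10-head GAT layer. A tiny sketch (assuming only torch and torch_geometric, not repository code) illustrates that shape:

```python
# Why fc_g1 above takes num_features_xd*10*2 inputs: the 10-head GAT layer yields
# 78*10 = 780 features per node, and concatenating max- and mean-pooled graph
# embeddings doubles that to 1560.
import torch
from torch_geometric.nn import global_max_pool as gmp, global_mean_pool as gap

x = torch.randn(20, 78 * 10)               # 20 nodes after the 10-head GAT layer
batch = torch.zeros(20, dtype=torch.long)  # all nodes belong to a single graph
pooled = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
print(pooled.shape)  # torch.Size([1, 1560]) == num_features_xd * 10 * 2
```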