diff --git a/apps/pretrained_compound/pretrain_gnns/README.md b/apps/pretrained_compound/pretrain_gnns/README.md index d09ef220..1f666078 100644 --- a/apps/pretrained_compound/pretrain_gnns/README.md +++ b/apps/pretrained_compound/pretrain_gnns/README.md @@ -52,6 +52,12 @@ You can download the [pretrained models](https://baidu-nlp.bj.bcebos.com/PaddleH #### Data link You can choose to download the dataset from the [link](http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip) provided by us and perform the corresponding preprocessing for your use. It is recommended to unzip the data set and put it in the data folder under the root directory, if not, please create a new data folder. + # cd to PaddleHelix folder + mkdir -p data + cd data + wget http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip + unzip chem_dataset.zip + ### Training Models The training methods of the pre-training strategy we provide are divided into two aspects. The first is the pre-training at the node level. There are two methods. The second is the supervised pre-training strategy for the whole image. You can choose during the specific experiment. 
Perform pre-training at the node level first, and then perform the pre-training at the graph level at the entire graph level, as follows: diff --git a/apps/pretrained_compound/pretrain_gnns/README_cn.md b/apps/pretrained_compound/pretrain_gnns/README_cn.md index fdf3f682..ef77066a 100644 --- a/apps/pretrained_compound/pretrain_gnns/README_cn.md +++ b/apps/pretrained_compound/pretrain_gnns/README_cn.md @@ -50,6 +50,12 @@ #### 数据地址 您可以选择从我们提供的[网址](http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip)上下载数据集然后进行相应的预处理来供您使用。建议解压数据集并将其放入根目录下的data文件夹中,如果没有请新建一个data文件夹。 + # cd to PaddleHelix folder + mkdir -p data + cd data + wget http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip + unzip chem_dataset.zip + ### 模型训练 我们提供的预训练策略的训练方式分为两个方面,首先是在节点级别的预训练,一共有两种方法,其次是整图的监督预训练策略,在具体实验的过程中,你可以选择先在节点级别进行预训练,再在整图级别上进行图级别的预训练,具体模型结构图如下: diff --git a/installation_guide.md b/installation_guide.md index 65dc6556..683c40c1 100644 --- a/installation_guide.md +++ b/installation_guide.md @@ -45,8 +45,20 @@ conda install -c conda-forge rdkit ``` 5. Install `paddle` based on your choice of GPU/CPU version: -Check `paddlepaddle`'s [official document](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html) -to install **paddle2.0**. + Check `paddlepaddle`'s [official document](https://www.paddlepaddle.org.cn/documentation/docs/en/2.0-rc1/install/index_en.html) + to install **paddle2.0**. + + For example, if you want to use the GPU version of PaddlePaddle on Linux, run this command: + + ```bash + python -m pip install paddlepaddle-gpu==2.0.0rc1.post90 -f https://paddlepaddle.org.cn/whl/stable.html + ``` + + Or, if you want to use the CPU version of PaddlePaddle on Linux, run this command: + + ```bash + python -m pip install paddlepaddle==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple + ``` 6. 
Install `PGL` using pip: diff --git a/installation_guide_cn.md b/installation_guide_cn.md index 53a0c369..f85599ee 100644 --- a/installation_guide_cn.md +++ b/installation_guide_cn.md @@ -44,7 +44,19 @@ conda install -c conda-forge rdkit ``` 5. 基于你对 CPU/GPU 版本的选择来安装 `paddle`: -请注意安装 **paddle2.0** 以上版本,方法参见 paddlepaddle [官方文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/install/index_cn.html)。 + 请注意安装 **paddle2.0** 以上版本,方法参见 paddlepaddle [官方文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/install/index_cn.html)。 + + 比如,你想在 Linux 系统上安装 paddlepaddle 2.0 GPU 版本,你可以运行以下命令: + + ```bash + python -m pip install paddlepaddle-gpu==2.0.0rc1.post90 -f https://paddlepaddle.org.cn/whl/stable.html + ``` + + 如果你想在 Linux 系统上安装 paddlepaddle 2.0 CPU 版本,你可以运行以下命令: + + ```bash + python -m pip install paddlepaddle==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple + ``` 6. 使用 pip 命令安装`PGL`: ```bash diff --git a/pahelix/__init__.py b/pahelix/__init__.py new file mode 100644 index 00000000..4c4743a0 --- /dev/null +++ b/pahelix/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Initialize. +""" + diff --git a/pahelix/datasets/__init__.py b/pahelix/datasets/__init__.py new file mode 100644 index 00000000..a471005a --- /dev/null +++ b/pahelix/datasets/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Initialize datasets. +""" + +from pahelix.datasets.inmemory_dataset import * +from pahelix.datasets.zinc_dataset import * +from pahelix.datasets.chembl_filtered_dataset import * +from pahelix.datasets.bace_dataset import * +from pahelix.datasets.bbbp_dataset import * +from pahelix.datasets.clintox_dataset import * +from pahelix.datasets.esol_dataset import * +from pahelix.datasets.freesolv_dataset import * +from pahelix.datasets.hiv_dataset import * +from pahelix.datasets.lipophilicity_dataset import * +from pahelix.datasets.muv_dataset import * +from pahelix.datasets.sider_dataset import * +from pahelix.datasets.tox21_dataset import * +from pahelix.datasets.toxcast_dataset import * +from pahelix.datasets.mutag_dataset import * +from pahelix.datasets.ptc_mr_dataset import * diff --git a/pahelix/featurizers/__init__.py b/pahelix/featurizers/__init__.py new file mode 100644 index 00000000..b9113829 --- /dev/null +++ b/pahelix/featurizers/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Initialize featurizers. +""" + +from pahelix.featurizers.featurizer import * +from pahelix.featurizers.pretrain_gnn_featurizer import * diff --git a/pahelix/model_zoo/__init__.py b/pahelix/model_zoo/__init__.py new file mode 100644 index 00000000..e55ad2b5 --- /dev/null +++ b/pahelix/model_zoo/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +model zoo. +""" + +from pahelix.model_zoo.pretrain_gnns_model import * diff --git a/pahelix/networks/__init__.py b/pahelix/networks/__init__.py new file mode 100644 index 00000000..b6df667f --- /dev/null +++ b/pahelix/networks/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Initialize model related tools. +""" + diff --git a/pahelix/tests/__init__.py b/pahelix/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pahelix/utils/__init__.py b/pahelix/utils/__init__.py new file mode 100644 index 00000000..d072eff6 --- /dev/null +++ b/pahelix/utils/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Initialize feature related tools. 
+""" \ No newline at end of file diff --git a/tutorials/compound_property_prediction_tutorial.ipynb b/tutorials/compound_property_prediction_tutorial.ipynb index d4b623cf..ba6edcc9 100644 --- a/tutorials/compound_property_prediction_tutorial.ipynb +++ b/tutorials/compound_property_prediction_tutorial.ipynb @@ -44,7 +44,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO] 2020-12-14 14:14:02,375 [mp_reader.py: 23]:\tujson not install, fail back to use json instead\n" + "[INFO] 2020-12-18 19:57:50,304 [mp_reader.py: 23]:\tujson not install, fail back to use json instead\n" ] } ], @@ -89,9 +89,24 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:78\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:98\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/networks/gnn_block.py:194\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -126,13 +141,49 @@ "metadata": {}, "source": [ "## Dataset loading and feature extraction\n", + "### Download the dataset using wget\n", "`PreGNNAttrMaskFeaturizer` is used along with `PreGNNAttrmaskModel`. It inherits from the super class `Featurizer` which is used for feature extractions. The `Featurizer` has two functions: `gen_features` for converting from a single raw SMILES to a single graph data, `collate_fn` for aggregating a sublist of graph data into a big batch.\n", "The zinc dataset is used as the pretraining dataset." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-18 19:58:05-- https://baidu-nlp.bj.bcebos.com/PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\n", + "Connecting to 172.19.56.199:3128... connected.\n", + "WARNING: certificate common name “*.bcebos.com” doesn’t match requested host name “baidu-nlp.bj.bcebos.com”.\n", + "Proxy request sent, awaiting response... 
200 OK\n", + "Length: 609563 (595K) [application/gzip]\n", + "Saving to: “PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz”\n", + "\n", + "100%[======================================>] 609,563 231K/s in 2.6s \n", + "\n", + "2020-12-18 19:58:12 (231 KB/s) - “PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz” saved [609563/609563]\n", + "\n", + "tox21 zinc_standard_agent\n" + ] + } + ], + "source": [ + "### Download a toy dataset for demonstration:\n", + "!wget \"https://baidu-nlp.bj.bcebos.com/PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\" --no-check-certificate\n", + "!tar -zxf \"PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\"\n", + "!ls \"./chem_dataset_small\"\n", + "### Download the full dataset as you want:\n", + "# !wget \"http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip\" --no-check-certificate\n", + "# !unzip \"chem_dataset.zip\"\n", + "# !ls \"./chem_dataset\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -148,7 +199,11 @@ " model.graph_wrapper, \n", " atom_type_num=len(CompoundConstants.atom_num_list),\n", " mask_ratio=0.15)\n", - "dataset = load_zinc_dataset(\"../../../data/chem_dataset/zinc_standard_agent/raw\", featurizer=featurizer)\n", + "### Load the first 1000 of the toy dataset for speed up\n", + "dataset = load_zinc_dataset(\"./chem_dataset_small/zinc_standard_agent/raw\", featurizer=featurizer)\n", + "dataset = dataset[:1000]\n", + "### Load the full dataset:\n", + "# dataset = load_zinc_dataset(\"./chem_dataset/zinc_standard_agent/raw\", featurizer=featurizer)\n", "print(\"dataset num: %s\" % (len(dataset)))" ] }, @@ -162,15 +217,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "epoch:0 train/loss:4.2393446\n", - "epoch:1 train/loss:1.477257\n" + "epoch:0 train/loss:0.7213694\n", + "epoch:1 
train/loss:0.70324224\n" ] } ], @@ -210,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -246,7 +301,6 @@ ], "source": [ "task_names = get_default_tox21_task_names()\n", - "# task_names = get_default_sider_task_names()\n", "print(task_names)" ] }, @@ -264,16 +318,25 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/apps/pretrained_compound/pretrain_gnns/model.py:90\n", + "The behavior of expression A * B has been unified with elementwise_mul(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_mul(X, Y, axis=0) instead of A * B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "data": { "text/plain": [ "[]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -314,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -341,15 +404,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [14:16:10] WARNING: not removing hydrogen atom without neighbors\n", - "RDKit WARNING: [14:16:25] WARNING: not removing hydrogen atom without neighbors\n" + "RDKit WARNING: [20:05:16] WARNING: not removing hydrogen atom without neighbors\n", + "RDKit WARNING: [20:05:32] WARNING: not removing hydrogen atom without neighbors\n" ] }, { @@ -362,10 +425,12 @@ ], "source": [ "featurizer = DownstreamFeaturizer(model.graph_wrapper)\n", + "### Load the toy dataset:\n", "dataset = load_tox21_dataset(\n", - " \"../../../data/chem_dataset/tox21/raw\", task_names, featurizer=featurizer)\n", - "# dataset = load_sider_dataset(\n", - "# \"../../../data/chem_dataset/sider/raw\", task_names, featurizer=featurizer)\n", + " \"./chem_dataset_small/tox21/raw\", task_names, featurizer=featurizer)\n", + "### Load the full dataset:\n", + "# dataset = load_tox21_dataset(\n", + "# \"./chem_dataset/tox21/raw\", task_names, featurizer=featurizer)\n", "\n", "# splitter = RandomSplitter()\n", "splitter = ScaffoldSplitter()\n", @@ -385,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -396,30 +461,30 @@ "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:0 train/loss:0.50505453\n", - "epoch:0 val/auc:0.619446883905476\n", - "epoch:0 test/auc:0.5755580865907087\n", + "epoch:0 train/loss:0.49475822\n", + 
"epoch:0 val/auc:0.6658935384290235\n", + "epoch:0 test/auc:0.6521651483374383\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:1 train/loss:0.25283575\n", - "epoch:1 val/auc:0.6492427350509836\n", - "epoch:1 test/auc:0.6505639462892321\n", + "epoch:1 train/loss:0.25100735\n", + "epoch:1 val/auc:0.6873462784930324\n", + "epoch:1 test/auc:0.6848870489412003\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:2 train/loss:0.22008401\n", - "epoch:2 val/auc:0.6877695463554699\n", - "epoch:2 test/auc:0.6832456625548606\n", + "epoch:2 train/loss:0.22103228\n", + "epoch:2 val/auc:0.6841346893995142\n", + "epoch:2 test/auc:0.6818656406099143\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:3 train/loss:0.21583365\n", - "epoch:3 val/auc:0.7055511601823229\n", - "epoch:3 test/auc:0.6873961667704048\n" + "epoch:3 train/loss:0.21569714\n", + "epoch:3 val/auc:0.6623265146967362\n", + "epoch:3 test/auc:0.6324247392008124\n" ] } ], @@ -463,7 +528,6 @@ " print(\"epoch:%s train/loss:%s\" % (epoch_id, train_loss))\n", " print(\"epoch:%s val/auc:%s\" % (epoch_id, val_auc))\n", " print(\"epoch:%s test/auc:%s\" % (epoch_id, test_auc))\n", - "# fluid.io.save_params(exe, './model/sider', train_prog)\n", "fluid.io.save_params(exe, './model/tox21', train_prog)" ] }, @@ -488,6 +552,23 @@ "execution_count": 17, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:78\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:98\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/networks/gnn_block.py:194\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "data": { "text/plain": [ @@ -564,18 +645,18 @@ "text": [ "SMILES:O=C1c2ccccc2C(=O)C1c1ccc2cc(S(=O)(=O)[O-])cc(S(=O)(=O)[O-])c2n1\n", "Predictions:\n", - " NR-AR:\t0.017969187\n", - " NR-AR-LBD:\t0.012354077\n", - " NR-AhR:\t0.029024104\n", - " NR-Aromatase:\t0.015708463\n", - " NR-ER:\t0.08152088\n", - " NR-ER-LBD:\t0.019772632\n", - " NR-PPAR-gamma:\t0.013134609\n", - " SR-ARE:\t0.09602512\n", - " SR-ATAD5:\t0.012249073\n", - " SR-HSE:\t0.025706206\n", - " SR-MMP:\t0.058807086\n", - " SR-p53:\t0.01833228\n" + " NR-AR:\t0.07395526\n", + " NR-AR-LBD:\t0.060779173\n", + " NR-AhR:\t0.32708064\n", + " NR-Aromatase:\t0.105057806\n", + " NR-ER:\t0.23960893\n", + " NR-ER-LBD:\t0.10899512\n", + " NR-PPAR-gamma:\t0.08047223\n", + " SR-ARE:\t0.3242342\n", + " SR-ATAD5:\t0.103697956\n", + " SR-HSE:\t0.08612242\n", + " SR-MMP:\t0.321919\n", + " SR-p53:\t0.15215957\n" ] } ], @@ -613,4 +694,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tutorials/compound_property_prediction_tutorial_cn.ipynb b/tutorials/compound_property_prediction_tutorial_cn.ipynb index 5615ff85..69415783 100644 --- a/tutorials/compound_property_prediction_tutorial_cn.ipynb +++ b/tutorials/compound_property_prediction_tutorial_cn.ipynb @@ -43,7 +43,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO] 2020-12-15 17:28:43,039 [mp_reader.py: 23]:\tujson not install, fail back to use json instead\n" + "[INFO] 2020-12-18 20:18:19,496 [mp_reader.py: 23]:\tujson not install, fail back to use json instead\n" ] } ], @@ -92,6 +92,21 @@ "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: 
/home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:78\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:98\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/networks/gnn_block.py:194\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -130,6 +145,41 @@ "使用 `PreGNNAttrMaskFeaturizer` 来配合模型 `PreGNNAttrmaskModel`。它继承了用于特征提取的超类 `Featurizer`。`Featurizer` 有两个功能:`gen_features` 用于将一条原始 SMILES 转换为图数据,而 `collate_fn` 用于将图数据的子列表聚合为一个 batch。这里我们采用 Zinc 数据集来进行预训练。" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-18 20:18:27-- https://baidu-nlp.bj.bcebos.com/PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\n", + "Connecting to 172.19.56.199:3128... connected.\n", + "WARNING: certificate common name “*.bcebos.com” doesn’t match requested host name “baidu-nlp.bj.bcebos.com”.\n", + "Proxy request sent, awaiting response... 200 OK\n", + "Length: 609563 (595K) [application/gzip]\n", + "Saving to: “PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz.1”\n", + "\n", + "100%[======================================>] 609,563 266K/s in 2.2s \n", + "\n", + "2020-12-18 20:18:32 (266 KB/s) - “PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz.1” saved [609563/609563]\n", + "\n", + "tox21 zinc_standard_agent\n" + ] + } + ], + "source": [ + "### Download a toy dataset for demonstration:\n", + "!wget \"https://baidu-nlp.bj.bcebos.com/PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\" --no-check-certificate\n", + "!tar -zxf \"PaddleHelix%2Fdatasets%2Fcompound_datasets%2Fchem_dataset_small.tgz\"\n", + "!ls \"./chem_dataset_small\"\n", + "### Download the full dataset as you want:\n", + "# !wget \"http://snap.stanford.edu/gnn-pretrain/data/chem_dataset.zip\" --no-check-certificate\n", + "# !unzip \"chem_dataset.zip\"\n", + "# !ls \"./chem_dataset\"" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -148,7 +198,11 @@ " model.graph_wrapper, \n", " 
atom_type_num=len(CompoundConstants.atom_num_list),\n", " mask_ratio=0.15)\n", - "dataset = load_zinc_dataset(\"../../../data/chem_dataset/zinc_standard_agent/raw\", featurizer=featurizer)\n", + "### Load the first 1000 of the toy dataset for speed up\n", + "dataset = load_zinc_dataset(\"./chem_dataset_small/zinc_standard_agent/raw\", featurizer=featurizer)\n", + "dataset = dataset[:1000]\n", + "### Load the full dataset:\n", + "# dataset = load_zinc_dataset(\"./chem_dataset/zinc_standard_agent/raw\", featurizer=featurizer)\n", "print(\"dataset num: %s\" % (len(dataset)))" ] }, @@ -163,15 +217,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "epoch:0 train/loss:4.2393446\n", - "epoch:1 train/loss:1.477257\n" + "epoch:0 train/loss:1.042354\n", + "epoch:1 train/loss:0.85617626\n" ] } ], @@ -212,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -235,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -248,7 +302,6 @@ ], "source": [ "task_names = get_default_tox21_task_names()\n", - "# task_names = get_default_sider_task_names()\n", "print(task_names)" ] }, @@ -265,16 +318,25 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/apps/pretrained_compound/pretrain_gnns/model.py:90\n", + "The behavior of expression A * B has been unified with elementwise_mul(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_mul(X, Y, axis=0) instead of A * B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "data": { "text/plain": [ "[]" ] }, - "execution_count": 15, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -351,8 +413,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "RDKit WARNING: [14:16:10] WARNING: not removing hydrogen atom without neighbors\n", - "RDKit WARNING: [14:16:25] WARNING: not removing hydrogen atom without neighbors\n" + "RDKit WARNING: [20:23:14] WARNING: not removing hydrogen atom without neighbors\n", + "RDKit WARNING: [20:23:29] WARNING: not removing hydrogen atom without neighbors\n" ] }, { @@ -365,10 +427,12 @@ ], "source": [ "featurizer = DownstreamFeaturizer(model.graph_wrapper)\n", + "### Load the toy dataset:\n", "dataset = load_tox21_dataset(\n", - " \"../../../data/chem_dataset/tox21/raw\", task_names, featurizer=featurizer)\n", - "# dataset = load_sider_dataset(\n", - "# \"../../../data/chem_dataset/sider/raw\", task_names, featurizer=featurizer)\n", + " \"./chem_dataset_small/tox21/raw\", task_names, featurizer=featurizer)\n", + "### Load the full dataset:\n", + "# dataset = load_tox21_dataset(\n", + "# \"./chem_dataset/tox21/raw\", task_names, featurizer=featurizer)\n", "\n", "# splitter = RandomSplitter()\n", "splitter = ScaffoldSplitter()\n", @@ -389,9 +453,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -400,30 +472,30 @@ "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:0 train/loss:0.50505453\n", - "epoch:0 val/auc:0.619446883905476\n", - "epoch:0 test/auc:0.5755580865907087\n", + "epoch:0 train/loss:0.2143532\n", + "epoch:0 val/auc:0.6834385071324883\n", + "epoch:0 test/auc:0.6735986534078915\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:1 train/loss:0.25283575\n", - "epoch:1 val/auc:0.6492427350509836\n", - "epoch:1 test/auc:0.6505639462892321\n", + "epoch:1 train/loss:0.20883079\n", + "epoch:1 val/auc:0.727357536520297\n", + "epoch:1 test/auc:0.6860799064889367\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:2 train/loss:0.22008401\n", - "epoch:2 val/auc:0.6877695463554699\n", - "epoch:2 test/auc:0.6832456625548606\n", + "epoch:2 train/loss:0.20595507\n", + "epoch:2 val/auc:0.691802226943295\n", + "epoch:2 test/auc:0.6565749600201853\n", "Valid ratio: 0.7603235\n", "Task evaluated: 12/12\n", "Valid ratio: 0.7513818\n", "Task evaluated: 12/12\n", - "epoch:3 train/loss:0.21583365\n", - "epoch:3 val/auc:0.7055511601823229\n", - "epoch:3 test/auc:0.6873961667704048\n" + "epoch:3 train/loss:0.20529544\n", + "epoch:3 val/auc:0.7298949686528418\n", + "epoch:3 test/auc:0.6967547132444416\n" ] } ], @@ -467,7 +539,6 @@ " print(\"epoch:%s train/loss:%s\" % (epoch_id, train_loss))\n", " print(\"epoch:%s val/auc:%s\" % (epoch_id, val_auc))\n", " print(\"epoch:%s test/auc:%s\" % (epoch_id, test_auc))\n", - "# fluid.io.save_params(exe, './model/sider', train_prog)\n", "fluid.io.save_params(exe, './model/tox21', train_prog)" ] }, @@ 
-489,16 +560,33 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:78\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/model_zoo/pretrain_gnns_model.py:98\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. 
This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n", + "/home/ol/anaconda2/envs/paddle2.0/lib/python3.7/site-packages/paddle/fluid/layers/math_op_patch.py:298: UserWarning: /home/ol/jieqiong/repos/PaddleHelix/tutorials/../pahelix/networks/gnn_block.py:194\n", + "The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future.\n", + " op_type, op_type, EXPRESSION_MAP[method_name]))\n" + ] + }, { "data": { "text/plain": [ "[]" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -532,7 +620,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -559,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -568,18 +656,18 @@ "text": [ "SMILES:O=C1c2ccccc2C(=O)C1c1ccc2cc(S(=O)(=O)[O-])cc(S(=O)(=O)[O-])c2n1\n", "Predictions:\n", - " NR-AR:\t0.017969187\n", - " NR-AR-LBD:\t0.012354077\n", - " NR-AhR:\t0.029024104\n", - " NR-Aromatase:\t0.015708463\n", - " NR-ER:\t0.08152088\n", - " NR-ER-LBD:\t0.019772632\n", - " NR-PPAR-gamma:\t0.013134609\n", - " SR-ARE:\t0.09602512\n", - " SR-ATAD5:\t0.012249073\n", - " SR-HSE:\t0.025706206\n", - " SR-MMP:\t0.058807086\n", - " SR-p53:\t0.01833228\n" + " NR-AR:\t0.03680832\n", + " NR-AR-LBD:\t0.026945854\n", + " NR-AhR:\t0.3054564\n", + " NR-Aromatase:\t0.067290716\n", + " NR-ER:\t0.28590235\n", + " NR-ER-LBD:\t0.07384003\n", + " NR-PPAR-gamma:\t0.02805905\n", + " SR-ARE:\t0.31503102\n", + " SR-ATAD5:\t0.056458935\n", + " SR-HSE:\t0.050735578\n", + " SR-MMP:\t0.33103985\n", + " SR-p53:\t0.09834782\n" ] } ], @@ -617,4 +705,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No 
newline at end of file +}