From 575ef62c334b5edf6df38f996bcaadb321c837b8 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:13:24 +0800 Subject: [PATCH 01/12] Update README_zh.md --- sat/README_zh.md | 52 ++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/sat/README_zh.md b/sat/README_zh.md index 8433a72..17e3f75 100644 --- a/sat/README_zh.md +++ b/sat/README_zh.md @@ -1,4 +1,4 @@ -# SAT CogView3 && CogView-3-Plus +# SAT CogView3 && CogView3-Plus 本文件夹包含了使用 [SAT](https://github.com/THUDM/SwissArmyTransformer) 权重的推理代码,以及 SAT 权重的微调代码。 @@ -18,12 +18,12 @@ pip install -r requirements.txt 以下链接为各个模型权重: -### CogView-3-Plus-3B +### CogView3-Plus-3B + transformer: https://cloud.tsinghua.edu.cn/d/f913eabd3f3b4e28857c + vae: https://cloud.tsinghua.edu.cn/d/af4cc066ce8a4cf2ab79 -### CogView-3-Base-3B +### CogView3-Base-3B + transformer: + cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 @@ -34,7 +34,7 @@ pip install -r requirements.txt + vae: https://cloud.tsinghua.edu.cn/d/c8b9497fc5124d71818a/ -### CogView-3-Base-3B-Relay +### CogView3-Base-3B-Relay + transformer: + cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ @@ -43,12 +43,12 @@ pip install -r requirements.txt **以上三个版本为替换关系,选择适合自己的版本和对应的配置文件进行运行** -+ vae: 与 CogView-3-Base-3B 相同 ++ vae: 与 CogView3-Base-3B 相同 接着,你需要将模型文件排版成如下格式: ``` -.cogview3-plus-3b +cogview3-plus-3b ├── transformer │ ├── 1 │ │ └── mp_rank_00_model_states.pt @@ -72,6 +72,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl 通过上述方案,你将会得到一个 safetensor 格式的T5文件,确保在 Deepspeed微调过程中读入的时候不会报错。 ``` +t5-v1_1-xxl ├── added_tokens.json ├── config.json ├── model-00001-of-00002.safetensors @@ -91,22 +92,22 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl ```yaml args: mode: inference - relay_model: False # 当模型类型为 CogView-3-Relay 时,需要将该参数设置为 True - load: "cogview3_base/transformer" # 这里填写到transformer文件夹 + relay_model: False # 当模型类型为 CogView3-Relay 时,需要将该参数设置为 True + load: "cogview3-base-3b" # 这里填写到含有latest的文件夹 batch_size: 8 # 每次推理图像数 grid_num_columns: 2 # 推理结束后,每个提示词文件夹下会有 grid.png 图片,该数字代表列数。 input_type: txt # 可以选择命令行输入,或者TXT文件输入 input_file: configs/test.txt # 如果使用命令行,不需要这个参数 - fp16: True # CogView-3-Plus 模型 需要更换为 bf16 推理 + fp16: True # CogView3-Plus 模型 需要更换为 bf16 推理 # bf16: True sampling_image_size: 512 # 固定大小,支持512 * 512 分辨率图像 # CogView-3-Plus 模型可以使用以下两个参数。 # sampling_image_size_x: 1024 宽 # sampling_image_size_y: 1024 高 - output_dir: "outputs/cogview3_base-512x512" + output_dir: "outputs/cogview3_base_512x512" # # 这个部分是给 CogView-3-Relay 模型使用的,需要将该参数设置为推理模型的输入文件夹,提示词建议与 base 模型生成图片时的提示词的一致。 - # input_dir: "outputs/cogview3_base-512x512" + # input_dir: "outputs/cogview3_base_512x512" deepspeed_config: { } model: @@ -118,13 +119,14 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" # T5 safetensors的绝对路径 + model_dir: "t5-v1_1-xxl" # T5 safetensors的绝对路径 max_length: 225 # 支持输入的提示词的最大长度 first_stage_config: target: sgm.models.autoencoder.AutoencodingEngine params: - ckpt_path: "cogview3_base/vae/imagekl_ch16.pt" # VAE PT文件绝对路径 + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" # VAE文件绝对路径 + # ckpt_path: "cogview3-plus-3b/vae/imagekl_ch16.pt" # CogView3-Plus VAE PT文件绝对路径 monitor: val/rec_loss ``` @@ -170,18 +172,20 @@ python sample_unet.py --base configs/cogview3_relay_distill_1step.yaml 其结构应该如下: ``` -. 
-├── 000000000.png -├── 000000001.png -├── 000000002.png -├── 000000003.png -├── 000000004.png -├── 000000005.png -├── 000000006.png -├── 000000007.png -└── grid.png +outputs +├── cogview3_base_512x512 + ├── 0_ + ├── 000000000.png + ├── 000000001.png + ├── 000000002.png + ├── 000000003.png + ├── 000000004.png + ├── 000000005.png + ├── 000000006.png + ├── 000000007.png + └── grid.png 1 directory, 9 files ``` -上述例子中,`batch` 为8。因此,有8张图像并带有一张`grid.png`的图像。 \ No newline at end of file +上述例子中,`batch` 为8。因此,有8张图像并带有一张`grid.png`的图像。 From 048db9116b489c8a6d4c8f82957402aac614b45d Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:13:25 +0800 Subject: [PATCH 02/12] Update README.md --- sat/README.md | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/sat/README.md b/sat/README.md index f2e8dc5..8658db4 100644 --- a/sat/README.md +++ b/sat/README.md @@ -1,4 +1,4 @@ -# SAT CogView3 & CogView-3-Plus +# SAT CogView3 & CogView3-Plus [Read this in Chinese](./README_zh.md) @@ -20,12 +20,12 @@ pip install -r requirements.txt The following links are for different model weights: -### CogView-3-Plus-3B +### CogView3-Plus-3B + transformer: https://cloud.tsinghua.edu.cn/d/f913eabd3f3b4e28857c + vae: https://cloud.tsinghua.edu.cn/d/af4cc066ce8a4cf2ab79 -### CogView-3-Base-3B +### CogView3-Base-3B + transformer: + cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 @@ -36,7 +36,7 @@ The following links are for different model weights: + vae: https://cloud.tsinghua.edu.cn/d/c8b9497fc5124d71818a/ -### CogView-3-Base-3B-Relay +### CogView3-Base-3B-Relay + transformer: + cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ @@ -45,12 +45,12 @@ The following links are for different model weights: **These three versions are interchangeable. Choose the one that suits your needs and run it with the corresponding configuration file.** -+ vae: Same as CogView-3-Base-3B ++ vae: Same as CogView3-Base-3B Next, arrange the model files into the following format: ``` -.cogview3-plus-3b +cogview3-plus-3b ├── transformer │ ├── 1 │ │ └── mp_rank_00_model_states.pt @@ -73,6 +73,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl With this setup, you will have a safetensor format T5 file, ensuring no errors during Deepspeed fine-tuning. ``` +t5-v1_1-xxl ├── added_tokens.json ├── config.json ├── model-00001-of-00002.safetensors @@ -92,8 +93,8 @@ Here is an example using `CogView3-Base`, with explanations for some of the para ```yaml args: mode: inference - relay_model: False # Set to True when using CogView-3-Relay - load: "cogview3_base/transformer" # Path to the transformer folder + relay_model: False # Set to True when using CogView3-Relay + load: "cogview3-base-3b" # Path to the folder with latest batch_size: 8 # Number of images per inference grid_num_columns: 2 # Number of columns in grid.png output input_type: txt # Input can be from command line or TXT file @@ -105,9 +106,9 @@ args: # sampling_image_size_x: 1024 (width) # sampling_image_size_y: 1024 (height) - output_dir: "outputs/cogview3_base-512x512" - # This section is for CogView-3-Relay. Set the input_dir to the folder with base model generated images. - # input_dir: "outputs/cogview3_base-512x512" + output_dir: "outputs/cogview3_base_512x512" + # This section is for CogView3-Relay. Set the input_dir to the folder with base model generated images. 
+ # input_dir: "outputs/cogview3_base_512x512" deepspeed_config: { } model: @@ -119,13 +120,14 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" # Path to T5 safetensors + model_dir: "t5-v1_1-xxl" # Path to T5 safetensors max_length: 225 # Maximum prompt length first_stage_config: target: sgm.models.autoencoder.AutoencodingEngine params: - ckpt_path: "cogview3_base/vae/imagekl_ch16.pt" # Path to VAE PT file + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" # Path to VAE file + # ckpt_path: "cogview3-plus-3b/vae/imagekl_ch16.pt" # Path to CogView3-Plus VAE PT file monitor: val/rec_loss ``` @@ -170,16 +172,18 @@ python sample_unet.py --base configs/cogview3_relay_distill_1step.yaml The output image format will be a folder. The folder name will consist of the sequence number and the first 15 characters of the prompt, containing multiple images. The number of images is based on the `batch` parameter. The structure should look like this: ``` -. -├── 000000000.png -├── 000000001.png -├── 000000002.png -├── 000000003.png -├── 000000004.png -├── 000000005.png -├── 000000006.png -├── 000000007.png -└── grid.png +outputs +├── cogview3_base_512x512 + ├── 0_ + ├── 000000000.png + ├── 000000001.png + ├── 000000002.png + ├── 000000003.png + ├── 000000004.png + ├── 000000005.png + ├── 000000006.png + ├── 000000007.png + └── grid.png 1 directory, 9 files ``` From 69648ca55a23de7609b9e87495de7fed2ee93fa8 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:14:13 +0800 Subject: [PATCH 03/12] Update requirements.txt --- sat/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sat/requirements.txt b/sat/requirements.txt index 43a2b98..1930e61 100644 --- a/sat/requirements.txt +++ b/sat/requirements.txt @@ -16,4 +16,5 @@ scipy>=1.14.1 SwissArmyTransformer>=0.4.12 tqdm>=4.66.5 wandb>=0.18.1 -openai>=1.48.0 \ No newline at end of file +openai>=1.48.0 +triton==2.1.0 From 2095cb2a2aa9ea1948911d6bdb960844c6af9ffc Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:17:21 +0800 Subject: [PATCH 04/12] Update cogview3_base.yaml --- sat/configs/cogview3_base.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sat/configs/cogview3_base.yaml b/sat/configs/cogview3_base.yaml index f03757d..2928cf0 100644 --- a/sat/configs/cogview3_base.yaml +++ b/sat/configs/cogview3_base.yaml @@ -1,15 +1,15 @@ args: mode: inference relay_model: False - load: "transformer" + load: "cogview3-base-3b" batch_size: 4 grid_num_columns: 2 input_type: txt - input_file: "configs/test_old.txt" + input_file: "configs/test.txt" fp16: True force_inference: True sampling_image_size: 512 - output_dir: "outputs/cogview3_base-512x512" + output_dir: "outputs/cogview3_base_512x512" deepspeed_config: { } model: @@ -61,7 +61,7 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" + model_dir: "t5-v1_1-xxl" max_length: 225 # vector cond @@ -86,7 +86,7 @@ model: first_stage_config: target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper params: - ckpt_path: "vae/sdxl_vae.safetensors" + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" embed_dim: 4 monitor: val/rec_loss ddconfig: From 30dbd896868c1a4c0283c0501c296e0f2b7f33ac Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: 
Mon, 30 Sep 2024 15:29:56 +0800 Subject: [PATCH 05/12] Update cogview3_base_distill_4step.yaml --- sat/configs/cogview3_base_distill_4step.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sat/configs/cogview3_base_distill_4step.yaml b/sat/configs/cogview3_base_distill_4step.yaml index 2f832c9..df58499 100644 --- a/sat/configs/cogview3_base_distill_4step.yaml +++ b/sat/configs/cogview3_base_distill_4step.yaml @@ -1,7 +1,7 @@ args: mode: inference relay_model: False - load: "transformer" + load: "cogview3-base-3b-distill-4step" batch_size: 4 grid_num_columns: 2 input_type: txt @@ -9,7 +9,7 @@ args: fp16: True force_inference: True sampling_image_size: 512 - output_dir: "outputs/cogview3_base_distill-4step" + output_dir: "outputs/cogview3_base_distill_4step" deepspeed_config: {} model: @@ -61,7 +61,7 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" + model_dir: "t5-v1_1-xxl" max_length: 225 # vector cond @@ -86,7 +86,7 @@ model: first_stage_config: target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper params: - ckpt_path: "vae/sdxl_vae.safetensors" + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" embed_dim: 4 monitor: val/rec_loss ddconfig: From 739ad0b212eb6aef5d03fc2c04afa478c0b4f3f5 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:30:25 +0800 Subject: [PATCH 06/12] Update cogview3_base_distill_4step.yaml --- sat/configs/cogview3_base_distill_4step.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sat/configs/cogview3_base_distill_4step.yaml b/sat/configs/cogview3_base_distill_4step.yaml index df58499..849ab95 100644 --- a/sat/configs/cogview3_base_distill_4step.yaml +++ b/sat/configs/cogview3_base_distill_4step.yaml @@ -94,7 +94,7 @@ model: double_z: true z_channels: 4 resolution: 256 - in_channels: 3f + in_channels: 3 out_ch: 3 ch: 128 ch_mult: [ 1, 2, 4, 4 ] From 5f384879276b802b5595595e2d92a8b873d430d6 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:32:27 +0800 Subject: [PATCH 07/12] Update cogview3_plus.yaml --- sat/configs/cogview3_plus.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sat/configs/cogview3_plus.yaml b/sat/configs/cogview3_plus.yaml index 30b9a30..baca1c0 100644 --- a/sat/configs/cogview3_plus.yaml +++ b/sat/configs/cogview3_plus.yaml @@ -1,7 +1,7 @@ args: mode: inference relay_model: False - load: "transformer" + load: "cogview3-plus-3b/transformer" batch_size: 4 grid_num_columns: 2 input_type: txt @@ -77,7 +77,7 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" + model_dir: "t5-v1_1-xxl" max_length: 224 # vector cond - is_trainable: False @@ -101,7 +101,7 @@ model: first_stage_config: target: sgm.models.autoencoder.AutoencodingEngine params: - ckpt_path: "vae/imagekl_ch16.pt" + ckpt_path: "cogview3-plus-3b/vae/imagekl_ch16.pt" monitor: val/rec_loss loss_config: From ee28c22f0e49d06457c63e8ef9b3f1d184a00873 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:35:56 +0800 Subject: [PATCH 08/12] Update cogview3_relay.yaml --- sat/configs/cogview3_relay.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sat/configs/cogview3_relay.yaml b/sat/configs/cogview3_relay.yaml index b9a3056..a2994f7 100644 --- 
a/sat/configs/cogview3_relay.yaml +++ b/sat/configs/cogview3_relay.yaml @@ -1,7 +1,7 @@ args: mode: inference relay_model: True - load: "transformer" + load: "cogview3-relay-3b" batch_size: 4 grid_num_columns: 2 input_type: txt @@ -9,8 +9,8 @@ args: fp16: True force_inference: True sampling_image_size: 1024 - output_dir: "outputs/cogview3_relay-1024x1024" - input_dir: "outputs/cogview3_base-512x512" + output_dir: "outputs/cogview3_relay_1024x1024" + input_dir: "outputs/cogview3_base_512x512" deepspeed_config: { } model: @@ -63,7 +63,7 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" + model_dir: "t5-v1_1-xxl" max_length: 225 # vector cond - is_trainable: False @@ -87,7 +87,7 @@ model: first_stage_config: target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper params: - ckpt_path: "vae/sdxl_vae.safetensors" + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" embed_dim: 4 monitor: val/rec_loss ddconfig: From 2f1af87739476221b5ef4486815dfb5520b39eba Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:39:10 +0800 Subject: [PATCH 09/12] Update cogview3_relay_distill_1step.yaml --- sat/configs/cogview3_relay_distill_1step.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sat/configs/cogview3_relay_distill_1step.yaml b/sat/configs/cogview3_relay_distill_1step.yaml index 480de44..3793776 100644 --- a/sat/configs/cogview3_relay_distill_1step.yaml +++ b/sat/configs/cogview3_relay_distill_1step.yaml @@ -1,6 +1,7 @@ args: mode: inference - load: "transformer" + relay_model: True + load: "cogview3-relay-3b-distill-1step" batch_size: 4 grid_num_columns: 2 input_type: txt @@ -9,7 +10,7 @@ args: force_inference: True sampling_image_size: 1024 # 这个值应该是你输入的图像分辨率的两倍 output_dir: "outputs/cogview3_relay_distill_1step" - input_dir: "inputs" # the inputs image should follow the order of input_file or cli input + input_dir: "outputs/cogview3_base_512x512" # the inputs image should follow the order of input_file or cli input deepspeed_config: { } model: @@ -63,7 +64,7 @@ model: input_key: txt target: sgm.modules.encoders.modules.FrozenT5Embedder params: - model_dir: "google/t5-v1_1-xxl" + model_dir: "t5-v1_1-xxl" max_length: 225 # vector cond - is_trainable: False @@ -87,7 +88,7 @@ model: first_stage_config: target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper params: - ckpt_path: "vae/sdxl_vae.safetensors" + ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" embed_dim: 4 monitor: val/rec_loss ddconfig: From ccd0f6a6c297629bcd8ce7b9f52230f618deaeef Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:39:57 +0800 Subject: [PATCH 10/12] Add files via upload --- sat/configs/test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 sat/configs/test.txt diff --git a/sat/configs/test.txt b/sat/configs/test.txt new file mode 100644 index 0000000..f59cc91 --- /dev/null +++ b/sat/configs/test.txt @@ -0,0 +1 @@ +Model portrait with pink hair, her long hair is soft and flowy. At dawn, she was surrounded by delicate flowers in the misty countryside. The style should be ethereal and dreamy, with soft and bright sunlight shining on her face to create a soft atmosphere. Her face has a classical beauty, her eyes are large, deep and bright, and her facial expressions reflect calmness, mystery and elegance, adding to the overall surrealist atmosphere. 
(Good proportions, cinematic angle:1.3) \ No newline at end of file From 68749b391d998b18c7de9b42d6343f8a7cb5ebb6 Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:45:50 +0800 Subject: [PATCH 11/12] Update README_zh.md --- sat/README_zh.md | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sat/README_zh.md b/sat/README_zh.md index 17e3f75..4f7787b 100644 --- a/sat/README_zh.md +++ b/sat/README_zh.md @@ -26,9 +26,9 @@ pip install -r requirements.txt ### CogView3-Base-3B + transformer: - + cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 - + cogview3-base-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e - + cogview3-base-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae + + cogview3-base-3b: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 + + cogview3-base-3b-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e + + cogview3-base-3b-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae + **以上三个版本为替换关系,选择适合自己的版本和对应的配置文件进行运行** @@ -37,9 +37,9 @@ pip install -r requirements.txt ### CogView3-Base-3B-Relay + transformer: - + cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ - + cogview3-relay-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402 - + cogview3-relay-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/ + + cogview3-relay-3b: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ + + cogview3-relay-3b-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402 + + cogview3-relay-3b-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/ **以上三个版本为替换关系,选择适合自己的版本和对应的配置文件进行运行** @@ -55,6 +55,12 @@ cogview3-plus-3b │ └── latest └── vae └── imagekl_ch16.pt +cogview3-base-3b +├── 1 +│ └──mp_rank_00_model_states.pt +└──latest +cogview3-base-3b-vae +└──sdxl_vae.safetensors ``` 克隆 T5 模型,该模型不用做训练和微调,但是必须使用。这里,您可以单独下载T5模型,必须是`safetensors`类型,不能是`bin` @@ -173,8 +179,8 @@ python sample_unet.py --base configs/cogview3_relay_distill_1step.yaml ``` outputs -├── cogview3_base_512x512 - ├── 0_ +└── cogview3_base_512x512 + └── 0_ ├── 000000000.png ├── 000000001.png ├── 000000002.png From d5e8d16de5161d313d77c9814d884daf1f1fd8df Mon Sep 17 00:00:00 2001 From: gluttony-10 <52977964+gluttony-10@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:45:52 +0800 Subject: [PATCH 12/12] Update README.md --- sat/README.md | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sat/README.md b/sat/README.md index 8658db4..476ebcc 100644 --- a/sat/README.md +++ b/sat/README.md @@ -28,9 +28,9 @@ The following links are for different model weights: ### CogView3-Base-3B + transformer: - + cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 - + cogview3-base-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e - + cogview3-base-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae + + cogview3-base-3b: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0 + + cogview3-base-3b-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e + + cogview3-base-3b-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae **These three versions are interchangeable. 
Choose the one that suits your needs and run it with the corresponding configuration file.** @@ -39,9 +39,9 @@ The following links are for different model weights: ### CogView3-Base-3B-Relay + transformer: - + cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ - + cogview3-relay-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402 - + cogview3-relay-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/ + + cogview3-relay-3b: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/ + + cogview3-relay-3b-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402 + + cogview3-relay-3b-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/ **These three versions are interchangeable. Choose the one that suits your needs and run it with the corresponding configuration file.** @@ -57,6 +57,12 @@ cogview3-plus-3b │ └── latest └── vae └── imagekl_ch16.pt +cogview3-base-3b +├── 1 +│ └──mp_rank_00_model_states.pt +└──latest +cogview3-base-3b-vae +└──sdxl_vae.safetensors ``` Clone the T5 model. This model is not used for training or fine-tuning but is necessary. You can download the T5 model separately, but it must be in `safetensors` format, not `bin` format (otherwise an error may occur). @@ -173,8 +179,8 @@ The output image format will be a folder. The folder name will consist of the se ``` outputs -├── cogview3_base_512x512 - ├── 0_ +└── cogview3_base_512x512 + └── 0_ ├── 000000000.png ├── 000000001.png ├── 000000002.png
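
A minimal shell sketch of arranging the downloaded weights into the layout the updated configs expect. Only the target folder names (`cogview3-base-3b`, `cogview3-base-3b-vae`, `t5-v1_1-xxl`) come from the patches above; the `/path/to/downloaded/...` source locations and the closing `sample_unet.py --base configs/cogview3_base.yaml` invocation are assumptions to adapt to your own setup.

```bash
# Target folder names follow the updated configs; the source paths below are
# placeholders for wherever the Tsinghua-cloud downloads were unpacked.
mkdir -p cogview3-base-3b cogview3-base-3b-vae t5-v1_1-xxl

# SAT transformer checkpoint: numbered shard directory plus the "latest" marker.
mv /path/to/downloaded/transformer/1      cogview3-base-3b/
mv /path/to/downloaded/transformer/latest cogview3-base-3b/

# SDXL VAE referenced by the base / relay / distill configs.
mv /path/to/downloaded/sdxl_vae.safetensors cogview3-base-3b-vae/

# T5 text encoder in safetensors form (conversion step described in the README).
mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl

# With this layout, the relative paths in configs/cogview3_base.yaml resolve, e.g.:
python sample_unet.py --base configs/cogview3_base.yaml
```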