Modify error code and optimize readme. #1

Open · wants to merge 12 commits into base: main
68 changes: 39 additions & 29 deletions sat/README.md
@@ -1,4 +1,4 @@
-# SAT CogView3 & CogView-3-Plus
+# SAT CogView3 & CogView3-Plus

[Read this in Chinese](./README_zh.md)

@@ -20,43 +20,49 @@ pip install -r requirements.txt

The following links are for different model weights:

-### CogView-3-Plus-3B
+### CogView3-Plus-3B

+ transformer: https://cloud.tsinghua.edu.cn/d/f913eabd3f3b4e28857c
+ vae: https://cloud.tsinghua.edu.cn/d/af4cc066ce8a4cf2ab79

-### CogView-3-Base-3B
+### CogView3-Base-3B

+ transformer:
-+ cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0
-+ cogview3-base-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e
-+ cogview3-base-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae
++ cogview3-base-3b: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0
++ cogview3-base-3b-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e
++ cogview3-base-3b-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae

**These three versions are interchangeable. Choose the one that suits your needs and run it with the corresponding configuration file.**

+ vae: https://cloud.tsinghua.edu.cn/d/c8b9497fc5124d71818a/

-### CogView-3-Base-3B-Relay
+### CogView3-Base-3B-Relay

+ transformer:
-+ cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/
-+ cogview3-relay-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402
-+ cogview3-relay-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/
++ cogview3-relay-3b: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/
++ cogview3-relay-3b-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402
++ cogview3-relay-3b-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/

**These three versions are interchangeable. Choose the one that suits your needs and run it with the corresponding configuration file.**

-+ vae: Same as CogView-3-Base-3B
++ vae: Same as CogView3-Base-3B

Next, arrange the model files into the following format:

```
-.cogview3-plus-3b
+cogview3-plus-3b
├── transformer
│   ├── 1
│   │   └── mp_rank_00_model_states.pt
│   └── latest
└── vae
    └── imagekl_ch16.pt
+cogview3-base-3b
+├── 1
+│   └── mp_rank_00_model_states.pt
+└── latest
+cogview3-base-3b-vae
+└── sdxl_vae.safetensors
```
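As a sanity check before running inference, the layout above can be verified with a short script. This is a sketch, not part of the repository: `EXPECTED` and `check_layout` are hypothetical names, and the paths simply mirror the tree shown above.

```python
import os

# Expected files for the SAT CogView3-Base-3B layout shown above.
EXPECTED = [
    "cogview3-base-3b/latest",
    "cogview3-base-3b/1/mp_rank_00_model_states.pt",
    "cogview3-base-3b-vae/sdxl_vae.safetensors",
]

def check_layout(root="."):
    """Return the list of expected model files missing under `root`."""
    return [p for p in EXPECTED if not os.path.exists(os.path.join(root, p))]

missing = check_layout()
if missing:
    print("Missing model files:", missing)
else:
    print("Model layout looks complete.")
```

Running this from the directory that holds the model folders prints any file the SAT loader would fail to find.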

Clone the T5 model. It is not trained or fine-tuned, but it is required. You can download the T5 model separately; it must be in `safetensors` format, not `bin` format (otherwise an error may occur).
@@ -73,6 +79,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
With this setup, you will have a safetensor format T5 file, ensuring no errors during Deepspeed fine-tuning.

```
+t5-v1_1-xxl
├── added_tokens.json
├── config.json
├── model-00001-of-00002.safetensors
@@ -92,8 +99,8 @@ Here is an example using `CogView3-Base`, with explanations for some of the parameters:
```yaml
args:
mode: inference
-relay_model: False # Set to True when using CogView-3-Relay
-load: "cogview3_base/transformer" # Path to the transformer folder
+relay_model: False # Set to True when using CogView3-Relay
+load: "cogview3-base-3b" # Path to the folder containing `latest`
batch_size: 8 # Number of images per inference
grid_num_columns: 2 # Number of columns in grid.png output
input_type: txt # Input can be from command line or TXT file
@@ -105,9 +112,9 @@ args:
# sampling_image_size_x: 1024 (width)
# sampling_image_size_y: 1024 (height)

-output_dir: "outputs/cogview3_base-512x512"
-# This section is for CogView-3-Relay. Set input_dir to the folder with the base model's generated images.
-# input_dir: "outputs/cogview3_base-512x512"
+output_dir: "outputs/cogview3_base_512x512"
+# This section is for CogView3-Relay. Set input_dir to the folder with the base model's generated images.
+# input_dir: "outputs/cogview3_base_512x512"
deepspeed_config: { }

model:
@@ -119,13 +126,14 @@ model:
input_key: txt
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
-model_dir: "google/t5-v1_1-xxl" # Path to T5 safetensors
+model_dir: "t5-v1_1-xxl" # Path to T5 safetensors
max_length: 225 # Maximum prompt length

first_stage_config:
target: sgm.models.autoencoder.AutoencodingEngine
params:
-ckpt_path: "cogview3_base/vae/imagekl_ch16.pt" # Path to VAE PT file
+ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" # Path to VAE file
+# ckpt_path: "cogview3-plus-3b/vae/imagekl_ch16.pt" # Path to the CogView3-Plus VAE PT file
monitor: val/rec_loss
```
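Before launching a sampling run, it can help to load the config and flag paths that do not exist yet. The sketch below assumes PyYAML is installed; `load_inference_args` is a hypothetical helper, not part of the repository.

```python
import os
import yaml  # pip install pyyaml

def load_inference_args(config_path):
    """Load a CogView3 SAT config and return its `args` section."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    args = cfg["args"]
    # Warn early about paths that do not exist yet, rather than failing mid-run.
    for key in ("load", "output_dir", "input_dir"):
        path = args.get(key)
        if path and not os.path.exists(path):
            print(f"note: args.{key} = {path!r} does not exist yet")
    return args
```

For example, `load_inference_args("configs/cogview3_base.yaml")` returns a dict with `mode`, `load`, `batch_size`, and so on, printing a note for each missing path.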

@@ -170,16 +178,18 @@ python sample_unet.py --base configs/cogview3_relay_distill_1step.yaml
The output is a folder per prompt. The folder name consists of the sequence number and the first 15 characters of the prompt, and it contains multiple images; the number of images matches the `batch_size` parameter. The structure should look like this:

```
-.
-├── 000000000.png
-├── 000000001.png
-├── 000000002.png
-├── 000000003.png
-├── 000000004.png
-├── 000000005.png
-├── 000000006.png
-├── 000000007.png
-└── grid.png
+outputs
+└── cogview3_base_512x512
+    └── 0_
+        ├── 000000000.png
+        ├── 000000001.png
+        ├── 000000002.png
+        ├── 000000003.png
+        ├── 000000004.png
+        ├── 000000005.png
+        ├── 000000006.png
+        ├── 000000007.png
+        └── grid.png

1 directory, 9 files
```
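For reference, a `grid.png` like the one above can be reassembled from the individual samples. This sketch assumes Pillow is installed and that all samples in a folder share one resolution; `make_grid` is a hypothetical helper that mimics the sampler's `grid_num_columns` behaviour, not code from the repository.

```python
import glob
import math
import os

from PIL import Image  # pip install pillow

def make_grid(sample_dir, num_columns=2, out_name="grid.png"):
    """Tile the numbered PNGs in one prompt folder into a single grid image."""
    paths = sorted(p for p in glob.glob(os.path.join(sample_dir, "*.png"))
                   if not p.endswith(out_name))
    images = [Image.open(p) for p in paths]
    w, h = images[0].size
    rows = math.ceil(len(images) / num_columns)
    grid = Image.new("RGB", (num_columns * w, rows * h))
    for i, img in enumerate(images):
        # Fill the grid row by row, left to right.
        grid.paste(img, ((i % num_columns) * w, (i // num_columns) * h))
    grid.save(os.path.join(sample_dir, out_name))
    return grid
```

With `batch_size: 8` and `grid_num_columns: 2`, this produces a 2-wide, 4-tall grid matching the folder listing above.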
70 changes: 40 additions & 30 deletions sat/README_zh.md
@@ -1,4 +1,4 @@
-# SAT CogView3 && CogView-3-Plus
+# SAT CogView3 && CogView3-Plus

本文件夹包含了使用 [SAT](https://github.com/THUDM/SwissArmyTransformer) 权重的推理代码,以及 SAT 权重的微调代码。

@@ -18,43 +18,49 @@ pip install -r requirements.txt

以下链接为各个模型权重:

-### CogView-3-Plus-3B
+### CogView3-Plus-3B

+ transformer: https://cloud.tsinghua.edu.cn/d/f913eabd3f3b4e28857c
+ vae: https://cloud.tsinghua.edu.cn/d/af4cc066ce8a4cf2ab79

-### CogView-3-Base-3B
+### CogView3-Base-3B

+ transformer:
-+ cogview3-base: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0
-+ cogview3-base-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e
-+ cogview3-base-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae
++ cogview3-base-3b: https://cloud.tsinghua.edu.cn/d/242b66daf4424fa99bf0
++ cogview3-base-3b-distill-4step: https://cloud.tsinghua.edu.cn/d/d10032a94db647f5aa0e
++ cogview3-base-3b-distill-8step: https://cloud.tsinghua.edu.cn/d/1598d4fe4ebf4afcb6ae
+
**以上三个版本为替换关系,选择适合自己的版本和对应的配置文件进行运行**

+ vae: https://cloud.tsinghua.edu.cn/d/c8b9497fc5124d71818a/

-### CogView-3-Base-3B-Relay
+### CogView3-Base-3B-Relay

+ transformer:
-+ cogview3-relay: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/
-+ cogview3-relay-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402
-+ cogview3-relay-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/
++ cogview3-relay-3b: https://cloud.tsinghua.edu.cn/d/134951acced949c1a9e1/
++ cogview3-relay-3b-distill-2step: https://cloud.tsinghua.edu.cn/d/6a902976fcb94ac48402
++ cogview3-relay-3b-distill-1step: https://cloud.tsinghua.edu.cn/d/4d50ec092c64418f8418/

**以上三个版本为替换关系,选择适合自己的版本和对应的配置文件进行运行**

-+ vae: 与 CogView-3-Base-3B 相同
++ vae: 与 CogView3-Base-3B 相同

接着,你需要将模型文件排版成如下格式:

```
-.cogview3-plus-3b
+cogview3-plus-3b
├── transformer
│   ├── 1
│   │   └── mp_rank_00_model_states.pt
│   └── latest
└── vae
    └── imagekl_ch16.pt
+cogview3-base-3b
+├── 1
+│   └── mp_rank_00_model_states.pt
+└── latest
+cogview3-base-3b-vae
+└── sdxl_vae.safetensors
```

克隆 T5 模型,该模型不用做训练和微调,但是必须使用。这里,您可以单独下载T5模型,必须是`safetensors`类型,不能是`bin`
@@ -72,6 +78,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
通过上述方案,你将会得到一个 safetensor 格式的T5文件,确保在 Deepspeed微调过程中读入的时候不会报错。

```
+t5-v1_1-xxl
├── added_tokens.json
├── config.json
├── model-00001-of-00002.safetensors
@@ -91,22 +98,22 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
```yaml
args:
mode: inference
-relay_model: False # 当模型类型为 CogView-3-Relay 时,需要将该参数设置为 True
-load: "cogview3_base/transformer" # 这里填写到transformer文件夹
+relay_model: False # 当模型类型为 CogView3-Relay 时,需要将该参数设置为 True
+load: "cogview3-base-3b" # 这里填写到含有latest的文件夹
batch_size: 8 # 每次推理图像数
grid_num_columns: 2 # 推理结束后,每个提示词文件夹下会有 grid.png 图片,该数字代表列数。
input_type: txt # 可以选择命令行输入,或者TXT文件输入
input_file: configs/test.txt # 如果使用命令行,不需要这个参数
-fp16: True # CogView-3-Plus 模型 需要更换为 bf16 推理
+fp16: True # CogView3-Plus 模型需要更换为 bf16 推理
# bf16: True
sampling_image_size: 512 # 固定大小,支持512 * 512 分辨率图像
# CogView-3-Plus 模型可以使用以下两个参数。
# sampling_image_size_x: 1024 宽
# sampling_image_size_y: 1024 高

-output_dir: "outputs/cogview3_base-512x512"
+output_dir: "outputs/cogview3_base_512x512"
# # 这个部分是给 CogView-3-Relay 模型使用的,需要将该参数设置为推理模型的输入文件夹,提示词建议与 base 模型生成图片时的提示词的一致。
-# input_dir: "outputs/cogview3_base-512x512"
+# input_dir: "outputs/cogview3_base_512x512"
deepspeed_config: { }

model:
@@ -118,13 +125,14 @@ model:
input_key: txt
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
-model_dir: "google/t5-v1_1-xxl" # T5 safetensors的绝对路径
+model_dir: "t5-v1_1-xxl" # T5 safetensors的绝对路径
max_length: 225 # 支持输入的提示词的最大长度

first_stage_config:
target: sgm.models.autoencoder.AutoencodingEngine
params:
-ckpt_path: "cogview3_base/vae/imagekl_ch16.pt" # VAE PT文件绝对路径
+ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors" # VAE文件绝对路径
+# ckpt_path: "cogview3-plus-3b/vae/imagekl_ch16.pt" # CogView3-Plus VAE PT文件绝对路径
monitor: val/rec_loss
```

@@ -170,18 +178,20 @@ python sample_unet.py --base configs/cogview3_relay_distill_1step.yaml
其结构应该如下:

```
-.
-├── 000000000.png
-├── 000000001.png
-├── 000000002.png
-├── 000000003.png
-├── 000000004.png
-├── 000000005.png
-├── 000000006.png
-├── 000000007.png
-└── grid.png
+outputs
+└── cogview3_base_512x512
+    └── 0_
+        ├── 000000000.png
+        ├── 000000001.png
+        ├── 000000002.png
+        ├── 000000003.png
+        ├── 000000004.png
+        ├── 000000005.png
+        ├── 000000006.png
+        ├── 000000007.png
+        └── grid.png

1 directory, 9 files
```

上述例子中,`batch` 为8。因此,有8张图像并带有一张`grid.png`的图像。
10 changes: 5 additions & 5 deletions sat/configs/cogview3_base.yaml
@@ -1,15 +1,15 @@
args:
mode: inference
relay_model: False
-load: "transformer"
+load: "cogview3-base-3b"
batch_size: 4
grid_num_columns: 2
input_type: txt
-input_file: "configs/test_old.txt"
+input_file: "configs/test.txt"
fp16: True
force_inference: True
sampling_image_size: 512
-output_dir: "outputs/cogview3_base-512x512"
+output_dir: "outputs/cogview3_base_512x512"
deepspeed_config: { }

model:
@@ -61,7 +61,7 @@ model:
input_key: txt
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
-model_dir: "google/t5-v1_1-xxl"
+model_dir: "t5-v1_1-xxl"
max_length: 225

# vector cond
@@ -86,7 +86,7 @@ model:
first_stage_config:
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
params:
-ckpt_path: "vae/sdxl_vae.safetensors"
+ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors"
embed_dim: 4
monitor: val/rec_loss
ddconfig:
10 changes: 5 additions & 5 deletions sat/configs/cogview3_base_distill_4step.yaml
@@ -1,15 +1,15 @@
args:
mode: inference
relay_model: False
-load: "transformer"
+load: "cogview3-base-3b-distill-4step"
batch_size: 4
grid_num_columns: 2
input_type: txt
input_file: "configs/test.txt"
fp16: True
force_inference: True
sampling_image_size: 512
-output_dir: "outputs/cogview3_base_distill-4step"
+output_dir: "outputs/cogview3_base_distill_4step"
deepspeed_config: {}

model:
@@ -61,7 +61,7 @@ model:
input_key: txt
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
-model_dir: "google/t5-v1_1-xxl"
+model_dir: "t5-v1_1-xxl"
max_length: 225

# vector cond
@@ -86,15 +86,15 @@ model:
first_stage_config:
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
params:
-ckpt_path: "vae/sdxl_vae.safetensors"
+ckpt_path: "cogview3-base-3b-vae/sdxl_vae.safetensors"
embed_dim: 4
monitor: val/rec_loss
ddconfig:
attn_type: vanilla-xformers
double_z: true
z_channels: 4
resolution: 256
-in_channels: 3f
+in_channels: 3
out_ch: 3
ch: 128
ch_mult: [ 1, 2, 4, 4 ]