eval ready
kwea123 committed Jan 23, 2021
1 parent 465202a commit f72c67a
Showing 4 changed files with 53 additions and 43 deletions.
20 changes: 14 additions & 6 deletions README.md
@@ -69,7 +69,7 @@ Download the pretrained models and training logs in [release](https://github.com

# :mag_right: Testing

See [test.ipynb](test.ipynb) for a simple view synthesis and depth prediction on 1 image.
Example: [test_nerf-u.ipynb](test_nerf-u.ipynb) shows how NeRF-U successfully decomposes the scene into static and transient components.

Use [eval.py](eval.py) to create the whole sequence of moving views.
E.g.
@@ -79,14 +79,22 @@ python eval.py \
--dataset_name blender --scene_name lego \
--img_wh 400 400 --N_importance 64 --ckpt_path $CKPT_PATH
```
**IMPORTANT**: Don't forget to add `--spheric_poses` if the model was trained with the `--spheric_poses` setting!

It will create the folder `results/{dataset_name}/{scene_name}`, run inference on all the test data, and finally create a gif out of the predictions.
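For reference, a minimal sketch of the final gif step, assuming frames were saved as `000.png`, `001.png`, ... (the paths, frame count, and fps here are illustrative, not the script's exact values):

```python
import imageio

# Collect the frames written by eval.py and stitch them into a gif.
frames = [imageio.imread(f'results/blender/lego/{i:03d}.png')
          for i in range(200)]
imageio.mimsave('results/blender/lego/lego.gif', frames, fps=30)
```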


Example of the lego scene using the pretrained **NeRF-U** model under the **occluder** condition (PSNR=28.60 vs. 23.47 in the paper):
![nerf-u](https://user-images.githubusercontent.com/11364490/105578186-a9933400-5dc1-11eb-8865-e276b581d8fd.gif)

# :warning: Notes on differences with the original repo

* The learning rate decay in the original repo is **by step** (it decreases every training step); here I use learning rate decay **by epoch**, so the rate changes only at the end of each epoch (see the scheduler sketch after this list).
* The validation image for the LLFF dataset is chosen as the most centered image here, whereas the original repo chooses every 8th image.
* The rendering spiral path is slightly different from the original repo (I use approximate values to simplify the code).
* Network structure ([nerf.py](models/nerf.py)):
  * My base MLP uses 8 layers of 256 units like the original NeRF, while NeRF-W uses **512** units each.
  * My static head uses 1 layer like the original NeRF, while NeRF-W uses **4** layers.
  * I use **softplus** activation for sigma (reason explained [here](https://github.com/bmild/nerf/issues/29#issuecomment-765335765)) while NeRF-W uses **relu** (see the sketch below).
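Two illustrative sketches for the points above. First, learning rate decay **by epoch**: a toy loop (hypothetical model and schedule, not the repo's actual training code) where the rate changes only when `scheduler.step()` runs at the epoch boundary:

```python
import torch

model = torch.nn.Linear(3, 1)  # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for epoch in range(20):
    for _ in range(100):  # 100 training steps per epoch
        optimizer.zero_grad()
        loss = model(torch.randn(8, 3)).pow(2).mean()
        loss.backward()
        optimizer.step()  # lr stays constant within the epoch
    scheduler.step()      # decay is applied only here
```

Second, the sigma activation: a hypothetical density head (not the exact `models/nerf.py` code) contrasting the two choices. Both keep sigma non-negative, but softplus also passes gradients for negative pre-activations where relu is flat:

```python
import torch

class SigmaHead(torch.nn.Module):
    def __init__(self, in_channels: int = 256, use_softplus: bool = True):
        super().__init__()
        self.linear = torch.nn.Linear(in_channels, 1)
        # softplus: smooth and positive with nonzero gradient everywhere;
        # relu: hard cutoff with zero gradient below 0
        self.act = torch.nn.Softplus() if use_softplus else torch.nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(self.linear(x))  # non-negative volume density
```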

* Training hyperparameters
  * I find that a larger `beta_min` achieves better results, so my default `beta_min` is `0.1` instead of the paper's `0.03`.
  * I empirically add 3 to `beta_loss` (equation 13) to keep it positive (see the sketch below).
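A minimal sketch of that shifted term, assuming `beta` is the per-ray variance predicted by the transient head (my reading of equation 13, not the repo's exact loss class):

```python
import torch

def beta_loss(beta: torch.Tensor) -> torch.Tensor:
    # log(beta) is negative for beta < 1 (e.g. beta_min = 0.1),
    # so the +3 offset keeps the logged value positive.
    return 3 + torch.log(beta).mean()

print(beta_loss(torch.full((1024,), 0.1)))  # ~0.70 instead of ~-2.30
```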

* Evaluation
  * The evaluation metric is computed on the **test** set, while NeRF evaluates on val and test combined.
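For reference, a PSNR sketch following the usual definition for images scaled to [0, 1] (the repo's exact reduction may differ):

```python
import torch

def psnr(pred: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
    # Peak signal-to-noise ratio in dB; higher is better.
    mse = torch.mean((pred - gt) ** 2)
    return -10 * torch.log10(mse)
```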
64 changes: 33 additions & 31 deletions eval.py
@@ -23,17 +23,16 @@ def get_opts():
default='/home/ubuntu/data/nerf_example_data/nerf_synthetic/lego',
help='root directory of dataset')
parser.add_argument('--dataset_name', type=str, default='blender',
choices=['blender', 'llff'],
choices=['blender'],
help='which dataset to validate')
parser.add_argument('--scene_name', type=str, default='test',
help='scene name, used as output folder name')
parser.add_argument('--split', type=str, default='test',
help='test or test_train')
parser.add_argument('--split', type=str, default='val',
choices=['val', 'test', 'test_train'])
parser.add_argument('--img_wh', nargs="+", type=int, default=[800, 800],
help='resolution (img_w, img_h) of the image')
parser.add_argument('--spheric_poses', default=False, action="store_true",
help='whether images are taken in spheric poses (for llff)')

# original NeRF parameters
parser.add_argument('--N_emb_xyz', type=int, default=10,
help='number of xyz embedding frequencies')
parser.add_argument('--N_emb_dir', type=int, default=4,
@@ -45,9 +44,19 @@ def get_opts():
parser.add_argument('--use_disp', default=False, action="store_true",
help='use disparity depth sampling')

# NeRF-W parameters
parser.add_argument('--N_vocab', type=int, default=100,
help='''number of vocabulary (number of images)
in the dataset for nn.Embedding''')
parser.add_argument('--encode_a', default=False, action="store_true",
help='whether to encode appearance (NeRF-A)')
parser.add_argument('--N_a', type=int, default=48,
help='number of embeddings for appearance')
parser.add_argument('--encode_t', default=False, action="store_true",
help='whether to encode transient object (NeRF-U)')
parser.add_argument('--N_tau', type=int, default=16,
help='number of embeddings for transient objects')
parser.add_argument('--beta_min', type=float, default=0.03,
parser.add_argument('--beta_min', type=float, default=0.1,
help='minimum color variance for each ray')

parser.add_argument('--chunk', type=int, default=32*1024*4,
@@ -56,12 +65,6 @@ def get_opts():
parser.add_argument('--ckpt_path', type=str, required=True,
help='pretrained checkpoint path to load')

parser.add_argument('--save_depth', default=False, action="store_true",
help='whether to save depth prediction')
parser.add_argument('--depth_format', type=str, default='pfm',
choices=['pfm', 'bytes'],
help='which format to save')

return parser.parse_args()


@@ -103,24 +106,32 @@ def batched_inference(models, embeddings,
kwargs = {'root_dir': args.root_dir,
'split': args.split,
'img_wh': tuple(args.img_wh)}
if args.dataset_name == 'llff':
kwargs['spheric_poses'] = args.spheric_poses
dataset = dataset_dict[args.dataset_name](**kwargs)

embedding_t = torch.nn.Embedding(200, args.N_tau)
embedding_xyz = PosEmbedding(args.N_emb_xyz-1, args.N_emb_xyz)
embedding_dir = PosEmbedding(args.N_emb_dir-1, args.N_emb_dir)
nerf_coarse = NeRF('coarse')
nerf_fine = NeRF('fine', beta_min=args.beta_min)
load_ckpt(embedding_t, args.ckpt_path, model_name='embedding_t')
embeddings = {'xyz': embedding_xyz, 'dir': embedding_dir}
if args.encode_a:
embedding_a = torch.nn.Embedding(args.N_vocab, args.N_a).cuda()
load_ckpt(embedding_a, args.ckpt_path, model_name='embedding_a')
embeddings['a'] = embedding_a
if args.encode_t:
embedding_t = torch.nn.Embedding(args.N_vocab, args.N_tau).cuda()
load_ckpt(embedding_t, args.ckpt_path, model_name='embedding_t')
embeddings['t'] = embedding_t

nerf_coarse = NeRF('coarse').cuda()
nerf_fine = NeRF('fine',
encode_appearance=args.encode_a,
in_channels_a=args.N_a,
encode_transient=args.encode_t,
in_channels_t=args.N_tau,
beta_min=args.beta_min).cuda()

load_ckpt(nerf_coarse, args.ckpt_path, model_name='nerf_coarse')
load_ckpt(nerf_fine, args.ckpt_path, model_name='nerf_fine')
embedding_t.cuda()
nerf_coarse.cuda()
nerf_fine.cuda()

models = {'coarse': nerf_coarse, 'fine': nerf_fine}
embeddings = {'xyz': embedding_xyz, 'dir': embedding_dir, 't': embedding_t}

imgs, psnrs = [], []
dir_name = f'results/{args.dataset_name}/{args.scene_name}'
@@ -137,15 +148,6 @@

img_pred = results['rgb_fine'].view(h, w, 3).cpu().numpy()

if args.save_depth:
depth_pred = results['depth_fine'].view(h, w).cpu().numpy()
depth_pred = np.nan_to_num(depth_pred)
if args.depth_format == 'pfm':
save_pfm(os.path.join(dir_name, f'depth_{i:03d}.pfm'), depth_pred)
else:
with open(f'depth_{i:03d}', 'wb') as f:
f.write(depth_pred.tobytes())

img_pred_ = (img_pred*255).astype(np.uint8)
imgs += [img_pred_]
imageio.imwrite(os.path.join(dir_name, f'{i:03d}.png'), img_pred_)
2 changes: 1 addition & 1 deletion opt.py
@@ -46,7 +46,7 @@ def get_opts():
help='whether to encode transient object (NeRF-U)')
parser.add_argument('--N_tau', type=int, default=16,
help='number of embeddings for transient objects')
parser.add_argument('--beta_min', type=float, default=0.03,
parser.add_argument('--beta_min', type=float, default=0.1,
help='minimum color variance for each ray')

parser.add_argument('--batch_size', type=int, default=1024,
10 changes: 5 additions & 5 deletions test_nerf-u.ipynb
@@ -38,6 +38,11 @@
"N_tau = 16\n",
"beta_min = 0.1\n",
"ckpt_path = 'ckpts/lego_nerfw_occ2/epoch=19.ckpt'\n",
"\n",
"N_samples = 64\n",
"N_importance = 64\n",
"use_disp = False\n",
"chunk = 1024*32\n",
"#############################\n",
"\n",
"embedding_xyz = PosEmbedding(9, 10)\n",
@@ -73,11 +78,6 @@
"metadata": {},
"outputs": [],
"source": [
"N_samples = 64\n",
"N_importance = 64\n",
"use_disp = False\n",
"chunk = 1024*32\n",
"\n",
"@torch.no_grad()\n",
"def f(rays, ts):\n",
" \"\"\"Do batched inference on rays using chunk.\"\"\"\n",
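The notebook's `f` uses the same chunked-inference pattern as eval.py; here is a standalone sketch of that pattern (`run_chunked` is a hypothetical helper, and `f` is assumed to return a dict of tensors like the repo's render results):

```python
import torch

@torch.no_grad()
def run_chunked(f, rays: torch.Tensor, ts: torch.Tensor, chunk: int = 1024 * 32):
    # Run f on at most `chunk` rays at a time and concatenate the
    # per-key outputs, bounding peak GPU memory.
    out = {}
    for i in range(0, rays.shape[0], chunk):
        for k, v in f(rays[i:i + chunk], ts[i:i + chunk]).items():
            out.setdefault(k, []).append(v)
    return {k: torch.cat(v, 0) for k, v in out.items()}
```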
