bugfix: leaked semaphore error (#309)

* use config for n_cpu

* remove unused import

* fix process loop

* stop using mp.spawn; launch workers with mp.Process instead

ref. https://discuss.pytorch.org/t/how-to-fix-a-sigsegv-in-pytorch-when-using-distributed-training-e-g-ddp/113518/10

* remove commented-out duplicate line
Author: N. Hiroto
Date: 2023-05-19 18:56:06 +09:00
Committed by: GitHub
Parent: 563c64ded9
Commit: 080b7cdc31
4 changed files with 19 additions and 17 deletions


@@ -66,18 +66,22 @@ class EpochRecorder:
 def main():
-    # n_gpus = torch.cuda.device_count()
     n_gpus = torch.cuda.device_count()
     os.environ["MASTER_ADDR"] = "localhost"
     os.environ["MASTER_PORT"] = "51545"
-    mp.spawn(
-        run,
-        nprocs=n_gpus,
-        args=(
+    children = []
+    for i in range(n_gpus):
+        subproc = mp.Process(target=run, args=(
+            i,
             n_gpus,
             hps,
-        ),
-    )
+        ))
+        children.append(subproc)
+        subproc.start()
+    for i in range(n_gpus):
+        children[i].join()
 
 
 def run(rank, n_gpus, hps):
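
For reference, a minimal self-contained sketch of the pattern the new code adopts: one mp.Process per GPU, all joined before the parent exits so multiprocessing semaphores are cleaned up rather than leaked. The `run` body and `hps` value below are stubs standing in for the repo's real training function and hyperparameter object, not the actual implementation.

import os

import torch
import torch.multiprocessing as mp


def run(rank, n_gpus, hps):
    # Stub worker: the real one would initialize the DDP process
    # group and train on GPU `rank`.
    print(f"worker {rank}/{n_gpus} started, hps={hps}")


def main():
    n_gpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "51545"

    hps = {"stub": True}  # placeholder for the real config object

    # Launch one child process per GPU. Unlike mp.spawn, the rank
    # must be passed explicitly as the first argument.
    children = []
    for i in range(n_gpus):
        subproc = mp.Process(target=run, args=(i, n_gpus, hps))
        children.append(subproc)
        subproc.start()

    # Join every child so the parent does not exit while worker
    # resources (including semaphores) are still live.
    for child in children:
        child.join()


if __name__ == "__main__":
    main()

The plain mp.Process loop does essentially what mp.spawn does but without mp.spawn's extra wrapper and error-handling machinery, which the linked forum thread implicates in the SIGSEGV and leaked-semaphore errors this commit works around.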