bugfix: leaked semaphore error (#309)

* use config for n_cpu

* remove unused import

* fix process loop

* stop using mp.spawn; launch workers with mp.Process instead

ref. https://discuss.pytorch.org/t/how-to-fix-a-sigsegv-in-pytorch-when-using-distributed-training-e-g-ddp/113518/10

* remove commented-out duplicate line
Author: N. Hiroto
Date: 2023-05-19 18:56:06 +09:00
Committed by: GitHub
Parent: 563c64ded9
Commit: 080b7cdc31
4 changed files with 19 additions and 17 deletions


@@ -66,18 +66,22 @@ class EpochRecorder:
 def main():
-    # n_gpus = torch.cuda.device_count()
     n_gpus = torch.cuda.device_count()
     os.environ["MASTER_ADDR"] = "localhost"
     os.environ["MASTER_PORT"] = "51545"
-    mp.spawn(
-        run,
-        nprocs=n_gpus,
-        args=(
+    children = []
+    for i in range(n_gpus):
+        subproc = mp.Process(target=run, args=(
+            i,
             n_gpus,
             hps,
-        ),
-    )
+        ))
+        children.append(subproc)
+        subproc.start()
+    for i in range(n_gpus):
+        children[i].join()
 
 
 def run(rank, n_gpus, hps):
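
For reference, a minimal self-contained sketch of the pattern the new code adopts: one mp.Process per GPU, all joined before the parent exits so multiprocessing semaphores are cleaned up rather than leaked. The `run` body and `hps` value below are stubs standing in for the repo's real training function and hyperparameter object, not the actual implementation.

import os

import torch
import torch.multiprocessing as mp


def run(rank, n_gpus, hps):
    # Stub worker: the real one would initialize the DDP process
    # group and train on GPU `rank`.
    print(f"worker {rank}/{n_gpus} started, hps={hps}")


def main():
    n_gpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "51545"

    hps = {"stub": True}  # placeholder for the real config object

    # Launch one child process per GPU. Unlike mp.spawn, the rank
    # must be passed explicitly as the first argument.
    children = []
    for i in range(n_gpus):
        subproc = mp.Process(target=run, args=(i, n_gpus, hps))
        children.append(subproc)
        subproc.start()

    # Join every child so the parent does not exit while worker
    # resources (including semaphores) are still live.
    for child in children:
        child.join()


if __name__ == "__main__":
    main()

The plain mp.Process loop does essentially what mp.spawn does but without mp.spawn's extra wrapper and error-handling machinery, which the linked forum thread implicates in the SIGSEGV and leaked-semaphore errors this commit works around.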