From 8858e98e0781b27420a2ac2866fe747ccbaf4f32 Mon Sep 17 00:00:00 2001 From: fanhy36 Date: Tue, 11 Mar 2025 10:22:01 +0800 Subject: [PATCH] fix issue #75: segfault and failure to listen on volcano.sock Signed-off-by: fanhy36 delete empty line Signed-off-by: fanhy36 --- pkg/plugin/nvidia/server.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pkg/plugin/nvidia/server.go b/pkg/plugin/nvidia/server.go index cf312a89a..c9c83b4c0 100644 --- a/pkg/plugin/nvidia/server.go +++ b/pkg/plugin/nvidia/server.go @@ -188,7 +188,17 @@ func (m *NvidiaDevicePlugin) DevicesNum() int { func (m *NvidiaDevicePlugin) Serve() error { sock, err := net.Listen("unix", m.socket) if err != nil { - return err + log.Printf("Listen sock fail and retry for '%s': %s", m.resourceName, err) + err = os.Remove(m.socket) + if err != nil { + log.Printf("Error deleting file: %s, %v\n", m.socket, err) + return err + } + sock, err = net.Listen("unix", m.socket) + if err != nil { + log.Printf("Retry Listen sock fail '%s': %s", m.resourceName, err) + return err + } } pluginapi.RegisterDevicePluginServer(m.server, m) @@ -343,6 +353,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc } sort.Sort(availablePods) + util.UseClient(m.kubeInteractor.clientset) var candidatePod *v1.Pod for _, pod := range availablePods { @@ -406,7 +417,6 @@ Allocate: return nil, fmt.Errorf("failed to update pod annotation %v", err) } - util.UseClient(m.kubeInteractor.clientset) klog.V(3).Infoln("Releasing lock: nodeName=", m.kubeInteractor.nodeName) err = util.ReleaseNodeLock(m.kubeInteractor.nodeName, "gpu") if err != nil {