Skip to content

fix: stop tearing down non-TTY processes on SSH session end #18673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion agent/agentssh/agentssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ type Server struct {
listeners map[net.Listener]struct{}
conns map[net.Conn]struct{}
sessions map[ssh.Session]struct{}
processes map[*os.Process]struct{}
closing chan struct{}
// Wait for goroutines to exit, waited without
// a lock on mu but protected by closing.
Expand Down Expand Up @@ -188,6 +189,7 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom
fs: fs,
conns: make(map[net.Conn]struct{}),
sessions: make(map[ssh.Session]struct{}),
processes: make(map[*os.Process]struct{}),
logger: logger,

config: config,
Expand Down Expand Up @@ -606,7 +608,10 @@ func (s *Server) startNonPTYSession(logger slog.Logger, session ssh.Session, mag
// otherwise context cancellation will not propagate properly
// and SSH server close may be delayed.
cmd.SysProcAttr = cmdSysProcAttr()
cmd.Cancel = cmdCancel(session.Context(), logger, cmd)

// to match OpenSSH, we don't actually tear a non-TTY command down, even if the session ends.
// c.f. https://github.com/coder/coder/issues/18519#issuecomment-3019118271
cmd.Cancel = nil

cmd.Stdout = session
cmd.Stderr = session.Stderr()
Expand All @@ -629,6 +634,16 @@ func (s *Server) startNonPTYSession(logger slog.Logger, session ssh.Session, mag
s.metrics.sessionErrors.WithLabelValues(magicTypeLabel, "no", "start_command").Add(1)
return xerrors.Errorf("start: %w", err)
}

// Since we don't cancel the process when the session stops, we still need to tear it down if we are closing. So
// track it here.
if !s.trackProcess(cmd.Process, true) {
// must be closing
err = cmdCancel(logger, cmd.Process)
return xerrors.Errorf("failed to track process: %w", err)
}
defer s.trackProcess(cmd.Process, false)

sigs := make(chan ssh.Signal, 1)
session.Signals(sigs)
defer func() {
Expand Down Expand Up @@ -1089,6 +1104,27 @@ func (s *Server) trackSession(ss ssh.Session, add bool) (ok bool) {
return true
}

// trackCommand registers the process with the server. If the server is
// closing, the process is not registered and should be closed.
//
//nolint:revive
func (s *Server) trackProcess(p *os.Process, add bool) (ok bool) {
s.mu.Lock()
defer s.mu.Unlock()
if add {
if s.closing != nil {
// Server closed.
return false
}
s.wg.Add(1)
s.processes[p] = struct{}{}
return true
}
s.wg.Done()
delete(s.processes, p)
return true
}

// Close the server and all active connections. Server can be re-used
// after Close is done.
func (s *Server) Close() error {
Expand Down Expand Up @@ -1128,6 +1164,10 @@ func (s *Server) Close() error {
_ = c.Close()
}

for p := range s.processes {
_ = cmdCancel(s.logger, p)
}

s.logger.Debug(ctx, "closing SSH server")
err := s.srv.Close()

Expand Down
10 changes: 4 additions & 6 deletions agent/agentssh/exec_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ package agentssh

import (
"context"
"os/exec"
"os"
"syscall"

"cdr.dev/slog"
Expand All @@ -16,9 +16,7 @@ func cmdSysProcAttr() *syscall.SysProcAttr {
}
}

func cmdCancel(ctx context.Context, logger slog.Logger, cmd *exec.Cmd) func() error {
return func() error {
logger.Debug(ctx, "cmdCancel: sending SIGHUP to process and children", slog.F("pid", cmd.Process.Pid))
return syscall.Kill(-cmd.Process.Pid, syscall.SIGHUP)
}
func cmdCancel(logger slog.Logger, p *os.Process) error {
logger.Debug(context.Background(), "cmdCancel: sending SIGHUP to process and children", slog.F("pid", p.Pid))
return syscall.Kill(-p.Pid, syscall.SIGHUP)
}
20 changes: 9 additions & 11 deletions agent/agentssh/exec_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package agentssh

import (
"context"
"os/exec"
"os"
"syscall"

"cdr.dev/slog"
Expand All @@ -12,14 +12,12 @@ func cmdSysProcAttr() *syscall.SysProcAttr {
return &syscall.SysProcAttr{}
}

func cmdCancel(ctx context.Context, logger slog.Logger, cmd *exec.Cmd) func() error {
return func() error {
logger.Debug(ctx, "cmdCancel: killing process", slog.F("pid", cmd.Process.Pid))
// Windows doesn't support sending signals to process groups, so we
// have to kill the process directly. In the future, we may want to
// implement a more sophisticated solution for process groups on
// Windows, but for now, this is a simple way to ensure that the
// process is terminated when the context is cancelled.
return cmd.Process.Kill()
}
func cmdCancel(logger slog.Logger, p *os.Process) error {
logger.Debug(context.Background(), "cmdCancel: killing process", slog.F("pid", p.Pid))
// Windows doesn't support sending signals to process groups, so we
// have to kill the process directly. In the future, we may want to
// implement a more sophisticated solution for process groups on
// Windows, but for now, this is a simple way to ensure that the
// process is terminated when the context is cancelled.
return p.Kill()
}
Loading