AUTH-4699, AUTH-8460, TUN-10179: Fix .lock file deletion race condition
Check / check (1.22.x, macos-latest) (push) Has been cancelled
Check / check (1.22.x, ubuntu-latest) (push) Has been cancelled
Check / check (1.22.x, windows-latest) (push) Has been cancelled
Semgrep config / semgrep/ci (push) Has been cancelled

Replace the lock file mechanism with PID+start-time based stale
detection so that no cleanup is required on process death.

When both org and app token locks were held, the first signal handler
to call os.Exit() would kill the process before the second handler
could delete its lock file. The orphaned lock file then caused the
next invocation to wait ~128 seconds in an exponential backoff loop
before forcibly deleting it. The same issue occurred on SIGKILL, OOM,
or any non-signal death.

Lock files now contain the holder's PID and process start time as
JSON. On acquisition, if a lock file already exists, the recorded
process is checked for liveness via gopsutil. Stale locks are
reclaimed immediately with no backoff. Atomic O_CREATE|O_EXCL
prevents races between concurrent acquirers.

Also adds a companion .url file so processes waiting on an active
lock can print the auth URL for the user.
This commit is contained in:
Evan Raw
2026-04-21 17:41:00 -05:00
parent 23b15d0eb6
commit da81fb02ec
5 changed files with 293 additions and 151 deletions
+2 -1
View File
@@ -100,6 +100,7 @@ func login(c *cli.Context) error {
c.Bool(cfdflags.AutoCloseInterstitial),
isFEDRamp,
log,
"",
)
if err != nil {
log.Error().Err(err).Msgf("Failed to write the certificate.\n\nYour browser will download the certificate instead. You will have to manually\ncopy it to the following path:\n\n%s\n", path)
@@ -122,7 +123,7 @@ func login(c *cli.Context) error {
return err
}
if err := os.WriteFile(path, resourceData, 0600); err != nil {
if err := os.WriteFile(path, resourceData, 0600); err != nil { // nolint: gosec
return errors.Wrap(err, fmt.Sprintf("error writing cert to %s", path))
}
+114
View File
@@ -0,0 +1,114 @@
package token
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestIsLockFileStale_DeadProcess(t *testing.T) {
// write a lock file with a PID that cannot exist (e.g., max int32)
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
content := lockContent{PID: 2147483647, StartTime: 1000000000000}
data, err := json.Marshal(content)
require.NoError(t, err)
require.NoError(t, os.WriteFile(path, data, 0600))
stale, _, err := isLockFileStale(path)
require.NoError(t, err)
assert.True(t, stale)
}
func TestIsLockFileStale_LiveProcess(t *testing.T) {
// write a lock file with our own PID and start time
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
content, err := newSelfLockContent()
require.NoError(t, err)
data, err := json.Marshal(content)
require.NoError(t, err)
require.NoError(t, os.WriteFile(path, data, 0600))
stale, readBack, err := isLockFileStale(path)
require.NoError(t, err)
assert.False(t, stale)
assert.Equal(t, content.PID, readBack.PID)
assert.Equal(t, content.StartTime, readBack.StartTime)
}
func TestIsLockFileStale_EmptyFile(t *testing.T) {
// backward compat: old lock files are empty
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
require.NoError(t, os.WriteFile(path, []byte{}, 0600))
stale, _, err := isLockFileStale(path)
require.NoError(t, err)
assert.True(t, stale)
}
func TestIsLockFileStale_CorruptFile(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
require.NoError(t, os.WriteFile(path, []byte("not json"), 0600))
stale, _, err := isLockFileStale(path)
require.NoError(t, err)
assert.True(t, stale)
}
func TestReadAuthURL_Exists(t *testing.T) {
dir := t.TempDir()
tokenPath := filepath.Join(dir, "token")
url := "https://example.com/cdn-cgi/access/cli?token=abc123"
require.NoError(t, os.WriteFile(tokenPath+".url", []byte(url), 0600))
assert.Equal(t, url, readAuthURL(tokenPath))
}
func TestReadAuthURL_NotExists(t *testing.T) {
dir := t.TempDir()
tokenPath := filepath.Join(dir, "token")
assert.Empty(t, readAuthURL(tokenPath))
}
func TestTryCreateLockFile_Success(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
err := tryCreateLockFile(path)
require.NoError(t, err)
// verify the file contains valid JSON with our PID
data, err := os.ReadFile(path) // nolint: gosec
require.NoError(t, err)
var content lockContent
require.NoError(t, json.Unmarshal(data, &content))
assert.Equal(t, int32(os.Getpid()), content.PID) // nolint: gosec
assert.Positive(t, content.StartTime)
}
func TestTryCreateLockFile_AlreadyExists(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "test.lock")
require.NoError(t, tryCreateLockFile(path))
// second create should fail with "already exists"
err := tryCreateLockFile(path)
require.Error(t, err)
assert.True(t, os.IsExist(err))
}
func TestNewSelfLockContent(t *testing.T) {
content, err := newSelfLockContent()
require.NoError(t, err)
assert.Equal(t, int32(os.Getpid()), content.PID) // nolint: gosec
assert.Positive(t, content.StartTime)
}
-54
View File
@@ -1,54 +0,0 @@
//go:build linux || darwin
package token
import (
"os"
"syscall"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestSignalHandler(t *testing.T) {
sigHandler := signalHandler{signals: []os.Signal{syscall.SIGUSR1}}
handlerRan := false
done := make(chan struct{})
timer := time.NewTimer(time.Second)
sigHandler.register(func() {
handlerRan = true
done <- struct{}{}
})
p, err := os.FindProcess(os.Getpid())
require.Nil(t, err)
p.Signal(syscall.SIGUSR1)
// Blocks for up to one second to make sure the handler callback runs before the assert.
select {
case <-done:
assert.True(t, handlerRan)
case <-timer.C:
t.Fail()
}
sigHandler.deregister()
}
func TestSignalHandlerClose(t *testing.T) {
sigHandler := signalHandler{signals: []os.Signal{syscall.SIGUSR1}}
done := make(chan struct{})
timer := time.NewTimer(time.Second)
sigHandler.register(func() { done <- struct{}{} })
sigHandler.deregister()
p, err := os.FindProcess(os.Getpid())
require.Nil(t, err)
p.Signal(syscall.SIGUSR1)
select {
case <-done:
t.Fail()
case <-timer.C:
}
}
+166 -93
View File
@@ -1,23 +1,18 @@
package token
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/go-jose/go-jose/v4"
"github.com/pkg/errors"
"github.com/rs/zerolog"
"github.com/cloudflare/cloudflared/config"
"github.com/cloudflare/cloudflared/retry"
"github.com/shirou/gopsutil/v4/process"
)
const (
@@ -41,15 +36,10 @@ type AppInfo struct {
AppDomain string
}
type lock struct {
lockFilePath string
backoff *retry.BackoffHandler
sigHandler *signalHandler
}
type signalHandler struct {
sigChannel chan os.Signal
signals []os.Signal
// lockContent is the JSON structure written into lock files.
type lockContent struct {
PID int32 `json:"pid"`
StartTime int64 `json:"start_time"`
}
type jwtPayload struct {
@@ -100,83 +90,174 @@ func (p jwtPayload) isExpired() bool {
return int(time.Now().Unix()) > p.Exp
}
func (s *signalHandler) register(handler func()) {
s.sigChannel = make(chan os.Signal, 1)
signal.Notify(s.sigChannel, s.signals...)
go func(s *signalHandler) {
for range s.sigChannel {
handler()
const (
lockRetryInterval = 2 * time.Second
lockTimeout = 10 * time.Minute
startTimeTolerance = int64(1000) // milliseconds
)
// acquireLockFile loops until it successfully creates a lock file for the
// given token file path. The lock file is created at tokenPath + ".lock".
//
// On each iteration:
// 1. Try to create the file atomically with O_CREATE|O_EXCL.
// If that succeeds, write our PID + start time and return nil.
// 2. If the file already exists, read it and check whether the owning
// process is still alive (PID exists and start time matches).
// 3. If the owner is alive, sleep for lockRetryInterval and retry.
// 4. If the owner is dead (stale lock), remove the file and immediately
// retry the O_EXCL create. No sleep (the atomic create is the
// tiebreaker if multiple processes race to reclaim).
func acquireLockFile(tokenPath string, log *zerolog.Logger) error {
lockPath := tokenPath + ".lock"
deadline := time.Now().Add(lockTimeout)
lastURL := ""
for {
if time.Now().After(deadline) {
return fmt.Errorf("timed out waiting for lock file %s", lockPath)
}
err := tryCreateLockFile(lockPath)
if err == nil {
log.Debug().Str("path", lockPath).Msg("lock file acquired")
return nil
}
if !os.IsExist(err) {
return errors.Wrapf(err, "failed to create lock file %s", lockPath)
}
}(s)
}
func (s *signalHandler) deregister() {
signal.Stop(s.sigChannel)
close(s.sigChannel)
}
func errDeleteTokenFailed(lockFilePath string) error {
return fmt.Errorf("failed to acquire a new Access token. Please try to delete %s", lockFilePath)
}
// newLock will get a new file lock
func newLock(path string) *lock {
lockPath := path + ".lock"
backoff := retry.NewBackoff(uint(7), retry.DefaultBaseTime, false)
return &lock{
lockFilePath: lockPath,
backoff: &backoff,
sigHandler: &signalHandler{
signals: []os.Signal{syscall.SIGINT, syscall.SIGTERM},
},
}
}
func (l *lock) Acquire() error {
// Intercept SIGINT and SIGTERM to release lock before exiting
l.sigHandler.register(func() {
_ = l.deleteLockFile()
os.Exit(0)
})
// Check for a lock file
// if the lock file exists; start polling
// if not, create the lock file and go through the normal flow.
// See AUTH-1736 for the reason why we do all this
for isTokenLocked(l.lockFilePath) {
if l.backoff.Backoff(context.Background()) {
// lock file exists, so check if the owner is still alive
stale, content, checkErr := isLockFileStale(lockPath)
if checkErr != nil {
// file may be mid-write by another racer, or was removed
// between our O_EXCL attempt and this read
log.Debug().Err(checkErr).Str("path", lockPath).
Msg("could not read lock file, retrying")
time.Sleep(lockRetryInterval)
continue
}
if err := l.deleteLockFile(); err != nil {
return err
if !stale {
// try to display the auth URL so the user can open a browser
// manually if the original window is not visible
if authURL := readAuthURL(tokenPath); authURL != "" && authURL != lastURL {
fmt.Fprintf(os.Stderr, "\nAnother cloudflared process (pid %d) "+
"is already waiting for authentication.\n\n"+
"If a browser window did not open, please visit "+
"the following URL:\n\n%s\n\n", content.PID, authURL)
lastURL = authURL
}
log.Debug().Str("path", lockPath).
Msg("lock file is held by another process, retrying")
time.Sleep(lockRetryInterval)
continue
}
// stale, so remove and immediately retry
log.Debug().Str("path", lockPath).Int32("stale_pid", content.PID).
Msg("reclaiming stale lock file")
if removeErr := os.Remove(lockPath); removeErr != nil && !os.IsNotExist(removeErr) {
log.Debug().Err(removeErr).Str("path", lockPath).
Msg("could not remove stale lock file, retrying")
time.Sleep(lockRetryInterval)
continue
}
}
}
// Create a lock file so other processes won't also try to get the token at
// the same time
if err := os.WriteFile(l.lockFilePath, []byte{}, 0600); err != nil {
// readAuthURL reads the auth URL companion file for the given token path.
// Returns the URL string, or empty string if the file doesn't exist or
// can't be read.
func readAuthURL(tokenPath string) string {
data, err := os.ReadFile(tokenPath + ".url") // nolint: gosec
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}
// tryCreateLockFile atomically creates the lock file using O_CREATE|O_EXCL
// and writes the current process's PID and start time into it as JSON.
// The file is created with 0600 permissions (owner read/write only).
func tryCreateLockFile(path string) (retErr error) {
f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600) // nolint: gosec
if err != nil {
return err
}
return nil
}
defer func() {
if retErr != nil {
_ = f.Close()
_ = os.Remove(path)
return
}
retErr = f.Close()
}()
func (l *lock) deleteLockFile() error {
if err := os.Remove(l.lockFilePath); err != nil && !os.IsNotExist(err) {
return errDeleteTokenFailed(l.lockFilePath)
content, err := newSelfLockContent()
if err != nil {
return err
}
return nil
return json.NewEncoder(f).Encode(content)
}
func (l *lock) Release() error {
defer l.sigHandler.deregister()
return l.deleteLockFile()
// newSelfLockContent returns a lockContent describing the current process.
func newSelfLockContent() (lockContent, error) {
pid := int32(os.Getpid()) // nolint: gosec
p, err := process.NewProcess(pid)
if err != nil {
return lockContent{}, fmt.Errorf("failed to look up own process: %w", err)
}
ct, err := p.CreateTime()
if err != nil {
return lockContent{}, fmt.Errorf("failed to get own start time: %w", err)
}
return lockContent{PID: pid, StartTime: ct}, nil
}
// isTokenLocked checks to see if there is another process attempting to get the token already
func isTokenLocked(lockFilePath string) bool {
exists, err := config.FileExists(lockFilePath)
return exists && err == nil
// isLockFileStale reads the lock file and checks whether the owning process
// is dead or has a mismatched start time. Returns (true, content, nil) if
// stale, (false, content, nil) if actively held, or an error if the file
// cannot be read.
func isLockFileStale(path string) (bool, lockContent, error) {
data, err := os.ReadFile(path) // nolint: gosec
if err != nil {
return false, lockContent{}, err
}
var content lockContent
if err := json.Unmarshal(data, &content); err != nil {
// corrupt or empty file (treat as stale)
return true, lockContent{}, nil
}
p, err := process.NewProcess(content.PID)
if err != nil {
return true, content, nil // process does not exist
}
// CreateTime reads /proc/{pid}/stat on Linux (world-readable, always works).
// On Windows and macOS it can fail for processes owned by a different user,
// but cloudflared instances sharing a lock file are always running as the
// same user (the lock directory is derived from ~ via go-homedir).
ct, err := p.CreateTime()
if err != nil {
return true, content, nil // cannot query process (treat as stale)
}
diff := ct - content.StartTime
if diff < 0 {
diff = -diff
}
if diff > startTimeTolerance {
return true, content, nil // PID was recycled (different process)
}
// If the lock file is older than lockTimeout, the auth flow is
// definitely complete and the process is no longer doing auth work.
info, err := os.Stat(path)
if err == nil && time.Since(info.ModTime()) > lockTimeout {
return true, content, nil
}
return false, content, nil // process is alive and actively authenticating
}
func Init(version string) {
@@ -206,13 +287,9 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, autoClose boo
return "", errors.Wrap(err, "failed to generate app token file path")
}
fileLockAppToken := newLock(appTokenPath)
if err = fileLockAppToken.Acquire(); err != nil {
if err = acquireLockFile(appTokenPath, log); err != nil {
return "", errors.Wrap(err, "failed to acquire app token lock")
}
defer func() {
_ = fileLockAppToken.Release()
}()
// check to see if another process has gotten a token while we waited for the lock
if token, err := GetAppTokenIfExists(appInfo); token != "" && err == nil {
@@ -228,13 +305,9 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, autoClose boo
return "", errors.Wrap(err, "failed to generate org token file path")
}
fileLockOrgToken := newLock(orgTokenPath)
if err = fileLockOrgToken.Acquire(); err != nil {
if err = acquireLockFile(orgTokenPath, log); err != nil {
return "", errors.Wrap(err, "failed to acquire org token lock")
}
defer func() {
_ = fileLockOrgToken.Release()
}()
// check if an org token has been created since the lock was acquired
orgToken, err = GetOrgTokenIfExists(appInfo.AuthDomain)
}
@@ -243,7 +316,7 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, autoClose boo
log.Debug().Msgf("failed to exchange org token for app token: %s", err)
} else {
// generate app path
if err := os.WriteFile(appTokenPath, []byte(appToken), 0600); err != nil {
if err := os.WriteFile(appTokenPath, []byte(appToken), 0600); err != nil { // nolint: gosec
return "", errors.Wrap(err, "failed to write app token to disk")
}
return appToken, nil
@@ -260,7 +333,7 @@ func getTokensFromEdge(appURL *url.URL, appAUD, appTokenPath, orgTokenPath strin
// this weird parameter is the resource name (token) and the key/value
// we want to send to the transfer service. the key is token and the value
// is blank (basically just the id generated in the transfer service)
resourceData, err := RunTransfer(appURL, appAUD, keyName, keyName, "", true, useHostOnly, autoClose, isFedramp, log)
resourceData, err := RunTransfer(appURL, appAUD, keyName, keyName, "", true, useHostOnly, autoClose, isFedramp, log, appTokenPath+".url")
if err != nil {
return "", errors.Wrap(err, "failed to run transfer service")
}
@@ -303,11 +376,11 @@ func GetAppInfo(reqURL *url.URL) (*AppInfo, error) {
return nil, errors.Wrap(err, "failed to create app info request")
}
appInfoReq.Header.Add("User-Agent", userAgent)
resp, err := client.Do(appInfoReq)
resp, err := client.Do(appInfoReq) // nolint: gosec
if err != nil {
return nil, errors.Wrap(err, "failed to get app info")
}
resp.Body.Close()
_ = resp.Body.Close()
var aud string
location := resp.Request.URL
@@ -374,11 +447,11 @@ func exchangeOrgToken(appURL *url.URL, orgToken string) (string, error) {
return "", errors.Wrap(err, "failed to create app token request")
}
appTokenRequest.Header.Add("User-Agent", userAgent)
resp, err := client.Do(appTokenRequest)
resp, err := client.Do(appTokenRequest) // nolint: gosec
if err != nil {
return "", errors.Wrap(err, "failed to get app token")
}
resp.Body.Close()
_ = resp.Body.Close()
var appToken string
for _, c := range resp.Cookies() {
//if Org token revoked on exchange, getTokensFromEdge instead
@@ -441,7 +514,7 @@ func GetAppTokenIfExists(appInfo *AppInfo) (string, error) {
// GetTokenIfExists will return the token from local storage if it exists and not expired
func getTokenIfExists(path string) (*jose.JSONWebSignature, error) {
content, err := os.ReadFile(path)
content, err := os.ReadFile(path) // nolint: gosec
if err != nil {
return nil, err
}
+11 -3
View File
@@ -26,7 +26,10 @@ const (
// The "dance" we refer to is building a HTTP request, opening that in a browser waiting for
// the user to complete an action, while it long polls in the background waiting for an
// action to be completed to download the resource.
func RunTransfer(transferURL *url.URL, appAUD, resourceName, key, value string, shouldEncrypt bool, useHostOnly bool, autoClose bool, fedramp bool, log *zerolog.Logger) ([]byte, error) {
//
// If urlFilePath is non-empty, the generated auth URL is written to that path so
// other waiting processes can display it to the user. Pass "" to skip.
func RunTransfer(transferURL *url.URL, appAUD, resourceName, key, value string, shouldEncrypt bool, useHostOnly bool, autoClose bool, fedramp bool, log *zerolog.Logger, urlFilePath string) ([]byte, error) {
encrypterClient, err := NewEncrypter("cloudflared_priv.pem", "cloudflared_pub.pem")
if err != nil {
return nil, err
@@ -36,6 +39,11 @@ func RunTransfer(transferURL *url.URL, appAUD, resourceName, key, value string,
return nil, err
}
// write auth URL to companion file so other waiting processes can display it
if urlFilePath != "" {
_ = os.WriteFile(urlFilePath, []byte(requestURL), 0600) // nolint: gosec
}
// See AUTH-1423 for why we use stderr (the way git wraps ssh)
err = OpenBrowser(requestURL)
if err != nil {
@@ -129,11 +137,11 @@ func poll(client *http.Client, requestURL string, log *zerolog.Logger) ([]byte,
return nil, "", err
}
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
resp, err := client.Do(req) // nolint: gosec
if err != nil {
return nil, "", err
}
defer resp.Body.Close()
defer func() { _ = resp.Body.Close() }()
// ignore everything other than server errors as the resource
// may not exist until the user does the interaction