fix(cloud): resolve read-only container tools in one shot (no extra LLM round-trip) (#4767)

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 04:10:12 +00:00 · 2026-06-01 07:59:19 -07:00
parent 51e13d5f97
commit 8811dc82bd
7 changed files with 568 additions and 38 deletions
@@ -67,7 +67,7 @@ var (
 		Properties: map[string]paramProperty{},
 	})

-	containerIDParam = paramProperty{Type: "string", Description: "Container name or ID. You can pass the container name directly (as shown in logs, events, and find_containers) — it does not need to be the opaque ID. Resolved by exact name first, then ID, then a unique name substring. If the name matches more than one container the call fails with the list of candidates so you can disambiguate."}
+	containerIDParam = paramProperty{Type: "string", Description: "Container name or ID. You can pass the container name directly (as shown in logs, events, and find_containers) — it does not need to be the opaque ID. Resolved by exact name first, then ID, then a unique name substring. When a name matches several containers (e.g. a Swarm service with stopped task corpses, or multiple replicas), read-only tools (inspect/logs) resolve to the most relevant one — the running replica, or the most-recently-active if all are stopped — and tell you which one (and its siblings) in the result, so you don't need to look up the ID first. Write tools (start/stop/restart/remove/update) never guess between live containers: an ambiguous name fails with the candidate list so you can re-issue with an exact ID."}
 	hostIDParam      = paramProperty{Type: "string", Description: "Host name or ID (from list_hosts or find_containers). Optional — omit it when the container name is unique across all hosts; supply it (name or ID) only to scope to a specific host when a name is ambiguous."}
 	boolFalse        = false

@@ -168,7 +168,11 @@ func executeInspectContainer(argsJSON string, deps ToolDeps) (*pb.CallToolRespon
 	if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
 		return nil, fmt.Errorf("failed to parse arguments: %w", err)
 	}
-	hostID, containerID, err := resolveContainerRef(args.ContainerID, args.Host, deps)
+	// Read-only: resolve an ambiguous name in one shot rather than erroring and
+	// forcing a find_containers round-trip. The note is discarded here because
+	// inspect already returns the concrete id/name/state, which is itself the
+	// disambiguation — no need to mangle those structured fields with prose.
+	hostID, containerID, _, err := resolveContainerRefRead(args.ContainerID, args.Host, deps)
 	if err != nil {
 		return nil, err
 	}
@@ -27,7 +27,7 @@ func executeFetchContainerLogs(ctx context.Context, argsJSON string, deps ToolDe
 	if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
 		return nil, fmt.Errorf("failed to parse arguments: %w", err)
 	}
-	hostID, containerID, err := resolveContainerRef(args.ContainerID, args.Host, deps)
+	hostID, containerID, note, err := resolveContainerRefRead(args.ContainerID, args.Host, deps)
 	if err != nil {
 		return nil, err
 	}
@@ -93,6 +93,17 @@ func executeFetchContainerLogs(ctx context.Context, argsJSON string, deps ToolDe

 	return &pb.CallToolResponse{
 		Success: true,
-		Result:  &pb.CallToolResponse_FetchLogs{FetchLogs: &pb.FetchLogsResult{ContainerName: cs.Container.Name, Entries: entries}},
+		Result:  &pb.CallToolResponse_FetchLogs{FetchLogs: &pb.FetchLogsResult{ContainerName: withResolutionNote(cs.Container.Name, note), Entries: entries}},
 	}, nil
 }
+
+// withResolutionNote appends the read-path resolution note (if any) to a
+// model-visible identity string, so the model learns which container an
+// ambiguous name resolved to — and that siblings exist — in the same turn,
+// without a round-trip. Empty note returns name unchanged.
+func withResolutionNote(name, note string) string {
+	if note == "" {
+		return name
+	}
+	return name + " " + note
+}
@@ -11,6 +11,14 @@ import (
 // id) plus an optional host reference (a name OR an id) into the concrete
 // (hostID, containerID) pair that HostService.FindContainer expects.
 //
+// This is the STRICT (write) resolver used by the destructive tools
+// (start/stop/restart/remove/update). It NEVER picks between multiple live
+// containers — acting on the wrong one is irreversible — so a name that matches
+// more than one container resolves only when exactly one candidate is running
+// (the others being stopped task corpses); otherwise it fails with the
+// candidate listing. Read-only tools use resolveContainerRefRead, which is
+// allowed to pick the most-relevant container in one shot (see that function).
+//
 // LLMs almost always have only the container name (it is what logs, events and
 // listings surface), so every container-scoped tool funnels through here rather
 // than passing the raw reference straight to FindContainer — which only matches
@@ -23,32 +31,128 @@ import (
 //  3. unique substring of the name
 //
 // The first tier that yields any candidates wins. An id match is always unique,
-// so it is never treated as ambiguous. If a name tier yields more than one
-// container the call fails with an error listing every candidate (name + id +
-// host) so the caller can disambiguate — we NEVER silently pick one. This is
-// what makes the write tools (stop/restart/remove/update) safe to drive by name.
+// so it is never treated as ambiguous. When a name tier yields more than one
+// container we try to break the tie by running state: if exactly one candidate
+// is running it is the unambiguous live referent (the others are stopped task
+// corpses left by Swarm redeploys / restart churn) and we resolve to it. Only
+// when the tie is real — zero running, or several live containers — does the
+// call fail with an error listing every candidate (name + id + state + host) so
+// the caller can disambiguate. We NEVER pick between multiple *live* containers,
+// which is what keeps the write tools (stop/restart/remove/update) safe by name.
 //
 // hostRef is optional: when empty the container must resolve unambiguously
 // across all hosts; when supplied it scopes the search to that host (matched by
 // id or name).
 func resolveContainerRef(containerRef, hostRef string, deps ToolDeps) (hostID, containerID string, err error) {
+	tier, trimmedHostRef, scopedHostID, hostNames, err := matchContainerTier(containerRef, hostRef, deps)
+	if err != nil {
+		return "", "", err
+	}
+
+	switch len(tier) {
+	case 0:
+		// No tier matched. When an explicit host was given (and resolved to a
+		// real host id) but the listing produced no match — e.g. a host returned
+		// a partial error and was omitted from ListAllContainers — pass the
+		// reference straight through to FindContainer's direct lookup, exactly as
+		// before this resolver existed. This guarantees id-based callers never
+		// regress.
+		if scopedHostID != "" {
+			return scopedHostID, strings.TrimSpace(containerRef), nil
+		}
+		return "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", strings.TrimSpace(containerRef))
+	case 1:
+		return tier[0].Host, tier[0].ID, nil
+	default:
+		// Multiple matches. The usual benign cause is Docker Swarm (or plain
+		// restart churn) leaving stopped task containers behind across redeploys
+		// — the short name "svc.1" substring-matches every historical
+		// "svc.1.<taskid>" corpse alongside the one live task. When exactly one
+		// candidate is running it is the unambiguous live referent, so resolve to
+		// it rather than make the caller hunt for an id. We still refuse to
+		// choose between multiple *live* containers — that is the real ambiguity
+		// the write tools must never guess at.
+		if live := runningContainers(tier); len(live) == 1 {
+			return live[0].Host, live[0].ID, nil
+		}
+		return "", "", ambiguousError(strings.TrimSpace(containerRef), trimmedHostRef, tier, hostNames)
+	}
+}
+
+// resolveContainerRefRead is the PERMISSIVE (read-only) resolver used by the
+// non-destructive tools (inspect_container, fetch_container_logs, stream_logs).
+//
+// It matches identically to resolveContainerRef, but when a name is ambiguous
+// it does NOT error — it resolves to the single most-relevant candidate in one
+// shot, eliminating the find-then-act round-trip that otherwise costs a whole
+// extra LLM call on the investigation path. The pick is safe precisely because
+// the tool is read-only: reading the wrong replica's logs is cheap and
+// recoverable, and same-service replicas are interchangeable for diagnosis,
+// whereas restarting the wrong one is not. The strict resolver above keeps the
+// write tools honest.
+//
+// Selection (bestCandidate): running containers beat stopped ones, then newest
+// StartedAt, then most-recently-finished/created. For the crash-loop case —
+// every "svc.1.*" task exited — this lands on the corpse that died most
+// recently, which is what the investigation is usually about.
+//
+// When it picks among several candidates it returns a one-line note naming the
+// chosen container and its siblings. The caller surfaces that note in the tool
+// RESULT (not via a round-trip), so the model gets the disambiguation context
+// for free in the same turn and can re-call with an explicit id if it genuinely
+// wants a different replica — but it is never forced to. note is empty when the
+// match was unique (nothing to disclose).
+func resolveContainerRefRead(containerRef, hostRef string, deps ToolDeps) (hostID, containerID, note string, err error) {
+	tier, _, scopedHostID, hostNames, err := matchContainerTier(containerRef, hostRef, deps)
+	if err != nil {
+		return "", "", "", err
+	}
+
+	switch len(tier) {
+	case 0:
+		if scopedHostID != "" {
+			return scopedHostID, strings.TrimSpace(containerRef), "", nil
+		}
+		return "", "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", strings.TrimSpace(containerRef))
+	case 1:
+		return tier[0].Host, tier[0].ID, "", nil
+	default:
+		// Exactly one running candidate is unambiguous — the corpses are never a
+		// valid target — so resolve cleanly with no note, identical to the strict
+		// resolver. A note is only warranted when the read path actually exercises
+		// its relaxation: zero running (pick the newest corpse) or several running
+		// replicas (pick the newest live one).
+		if live := runningContainers(tier); len(live) == 1 {
+			return live[0].Host, live[0].ID, "", nil
+		}
+		best := bestCandidate(tier)
+		note = resolutionNote(strings.TrimSpace(containerRef), best, tier, hostNames)
+		return best.Host, best.ID, note, nil
+	}
+}
+
+// matchContainerTier runs the shared scoping + tiered matching used by both
+// resolvers and returns the winning tier (the first non-empty of exact-id,
+// exact-name, substring). It also returns the trimmed hostRef, the resolved
+// scoped host id (empty when no host was supplied), and the host-name map for
+// building human-readable notes/errors. An empty returned tier means no match.
+func matchContainerTier(containerRef, hostRef string, deps ToolDeps) (tier []container.Container, trimmedHostRef, scopedHostID string, hostNames map[string]string, err error) {
 	containerRef = strings.TrimSpace(containerRef)
 	if containerRef == "" {
-		return "", "", fmt.Errorf("container_id is required")
+		return nil, "", "", nil, fmt.Errorf("container_id is required")
 	}

 	containers, errs := deps.HostService.ListAllContainers(deps.Labels)
 	logHostErrors(errs)
-	hostNames := buildHostNameMap(deps.HostService)
+	hostNames = buildHostNameMap(deps.HostService)

 	// Scope to a host if one was supplied. The host reference may be an id or a
 	// name; an unknown host is an explicit error rather than a silent no-match.
-	hostRef = strings.TrimSpace(hostRef)
-	var scopedHostID string
-	if hostRef != "" {
-		scopedHostID, err = resolveHostRef(hostRef, deps)
+	trimmedHostRef = strings.TrimSpace(hostRef)
+	if trimmedHostRef != "" {
+		scopedHostID, err = resolveHostRef(trimmedHostRef, deps)
 		if err != nil {
-			return "", "", err
+			return nil, trimmedHostRef, "", hostNames, err
 		}
 		filtered := containers[:0:0]
 		for _, c := range containers {
@@ -75,27 +179,12 @@ func resolveContainerRef(containerRef, hostRef string, deps ToolDeps) (hostID, c
 		}
 	}

-	for _, tier := range [][]container.Container{exactID, exactName, substring} {
-		switch len(tier) {
-		case 0:
-			continue
-		case 1:
-			return tier[0].Host, tier[0].ID, nil
-		default:
-			return "", "", ambiguousError(containerRef, hostRef, tier, hostNames)
+	for _, t := range [][]container.Container{exactID, exactName, substring} {
+		if len(t) > 0 {
+			return t, trimmedHostRef, scopedHostID, hostNames, nil
 		}
 	}
-
-	// Legacy fallback: when an explicit host was given (and resolved to a real
-	// host id) but the listing produced no match — e.g. a host returned a
-	// partial error and was omitted from ListAllContainers — pass the reference
-	// straight through to FindContainer's direct lookup, exactly as before this
-	// resolver existed. This guarantees id-based callers never regress.
-	if scopedHostID != "" {
-		return scopedHostID, containerRef, nil
-	}
-
-	return "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", containerRef)
+	return nil, trimmedHostRef, scopedHostID, hostNames, nil
 }

 // matchesID reports whether ref identifies the container id — either the full id
@@ -152,7 +241,7 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
 	parts := make([]string, len(candidates))
 	sameHost := true
 	for i, c := range candidates {
-		parts[i] = fmt.Sprintf("%s (id %s on host %s)", c.Name, shortID(c.ID), resolveHostName(c.Host, hostNames))
+		parts[i] = fmt.Sprintf("%s (id %s, %s, on host %s)", c.Name, shortID(c.ID), describeState(c), resolveHostName(c.Host, hostNames))
 		if c.Host != candidates[0].Host {
 			sameHost = false
 		}
@@ -170,6 +259,118 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
 	return fmt.Errorf("%q matches multiple containers: %s. To act on the right one, %s", containerRef, strings.Join(parts, "; "), hint)
 }

+// runningContainers returns only the candidates in the running state. It breaks
+// a multi-match tie down to the single live container when every other candidate
+// is a stopped corpse (the typical Swarm-redeploy / restart-churn case), which
+// is the one situation where picking among matches is unambiguous and safe.
+func runningContainers(cs []container.Container) []container.Container {
+	live := make([]container.Container, 0, len(cs))
+	for _, c := range cs {
+		if strings.EqualFold(c.State, "running") {
+			live = append(live, c)
+		}
+	}
+	return live
+}
+
+// bestCandidate picks the single most-relevant container from an ambiguous name
+// match for the read-only path. Ordering: running beats stopped, then newest
+// StartedAt, then most-recently-finished, then most-recently-created. For the
+// crash-loop case (every "svc.1.*" task exited) this lands on the corpse that
+// died most recently — the one the investigation is almost always about. The
+// caller guarantees len(cs) > 0.
+func bestCandidate(cs []container.Container) container.Container {
+	best := cs[0]
+	for _, c := range cs[1:] {
+		if moreRelevant(c, best) {
+			best = c
+		}
+	}
+	return best
+}
+
+// moreRelevant reports whether a should be preferred over b as the read-path
+// referent. Running always wins; among the same running-ness, the more-recently
+// active container wins (StartedAt, then FinishedAt, then Created).
+func moreRelevant(a, b container.Container) bool {
+	aRunning, bRunning := isRunning(a), isRunning(b)
+	if aRunning != bRunning {
+		return aRunning
+	}
+	if !a.StartedAt.Equal(b.StartedAt) {
+		return a.StartedAt.After(b.StartedAt)
+	}
+	if !a.FinishedAt.Equal(b.FinishedAt) {
+		return a.FinishedAt.After(b.FinishedAt)
+	}
+	return a.Created.After(b.Created)
+}
+
+func isRunning(c container.Container) bool {
+	return strings.EqualFold(c.State, "running")
+}
+
+// resolutionNote renders the one-line transparency note the read-only tools
+// surface in their result when a name resolved to one of several candidates. It
+// names the chosen container (state-annotated), summarizes how many siblings of
+// each running-ness exist, and lists the sibling names — so the model can re-call
+// with an explicit id if it wants a different replica, without ever being forced
+// to. Returned as a parenthetical prefix the tools prepend to a human-visible
+// field.
+func resolutionNote(containerRef string, chosen container.Container, candidates []container.Container, hostNames map[string]string) string {
+	running := len(runningContainers(candidates))
+
+	// Annotate sibling names with their host only when the candidates span more
+	// than one host — that is the case where the model needs the host to re-target
+	// a sibling; on a single host it would just be noise.
+	multiHost := false
+	for _, c := range candidates {
+		if c.Host != chosen.Host {
+			multiHost = true
+			break
+		}
+	}
+
+	siblings := make([]string, 0, len(candidates)-1)
+	for _, c := range candidates {
+		if c.ID == chosen.ID {
+			continue
+		}
+		if multiHost {
+			siblings = append(siblings, fmt.Sprintf("%s on host %s", c.Name, resolveHostName(c.Host, hostNames)))
+		} else {
+			siblings = append(siblings, c.Name)
+		}
+	}
+
+	var summary string
+	switch {
+	case running > 1 && isRunning(chosen):
+		summary = fmt.Sprintf("newest of %d running replicas", running)
+	case running == 0:
+		summary = fmt.Sprintf("most recently active of %d stopped containers", len(candidates))
+	default:
+		summary = fmt.Sprintf("1 of %d matching containers", len(candidates))
+	}
+
+	return fmt.Sprintf("(resolved %q → %s [%s], %s; others: %s)",
+		containerRef, chosen.Name, describeState(chosen), summary, strings.Join(siblings, ", "))
+}
+
+// describeState renders a candidate's state (with health when known) for the
+// ambiguity listing, so the caller can tell a live replica from a stopped corpse
+// and pick the right one. A blank state is reported as "unknown".
+func describeState(c container.Container) string {
+	state := c.State
+	if state == "" {
+		state = "unknown"
+	}
+	if c.Health != "" {
+		return fmt.Sprintf("%s (%s)", state, c.Health)
+	}
+	return state
+}
+
 // shortID trims a full docker id to its conventional 12-character form for
 // display in errors.
 func shortID(id string) string {
@@ -0,0 +1,239 @@
+package cloud
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/amir20/dozzle/internal/container"
+	container_support "github.com/amir20/dozzle/internal/support/container"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+)
+
+// The read path collapses the find-then-act round-trip: when a name matches
+// several same-service replicas/corpses, a read-only tool resolves to the
+// single most-relevant container (running first, then newest active) in one
+// shot instead of erroring and forcing the LLM to call find_containers and
+// re-issue. The write path stays strict — these tests lock both halves in.
+
+// --- resolver unit tests: read mode (resolveContainerRefRead) ---
+
+func TestResolveRead_AllStopped_PicksMostRecentlyActiveCorpse(t *testing.T) {
+	// The crash-loop case: every "svc.1.*" task has exited. There is no live
+	// container, but a read-only inspect/logs call still has an obvious target —
+	// the corpse that died most recently, which is what the investigation is
+	// about. Resolve to it rather than erroring.
+	t0 := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+	deps := resolverDeps([]container.Container{
+		{ID: "old", Name: "svc.1.aaa", Host: "local", State: "exited", StartedAt: t0, FinishedAt: t0.Add(1 * time.Minute)},
+		{ID: "newest", Name: "svc.1.bbb", Host: "local", State: "exited", StartedAt: t0.Add(5 * time.Minute), FinishedAt: t0.Add(6 * time.Minute)},
+		{ID: "mid", Name: "svc.1.ccc", Host: "local", State: "exited", StartedAt: t0.Add(2 * time.Minute), FinishedAt: t0.Add(3 * time.Minute)},
+	})
+
+	host, id, note, err := resolveContainerRefRead("svc.1", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "local", host)
+	assert.Equal(t, "newest", id)
+	// The pick is transparent: the note names the chosen task and that siblings
+	// exist, so the model can re-call with an explicit id if it wants another.
+	assert.NotEmpty(t, note)
+	assert.Contains(t, note, "svc.1.bbb")
+}
+
+func TestResolveRead_MultipleRunning_PicksNewestRunningWithNote(t *testing.T) {
+	// Several live replicas of one service. For a read (logs/inspect) the
+	// replicas are interchangeable, so resolve to the newest running one and say
+	// so — never force a round-trip.
+	t0 := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+	deps := resolverDeps([]container.Container{
+		{ID: "r-old", Name: "svc.1.aaa", Host: "local", State: "running", StartedAt: t0},
+		{ID: "r-new", Name: "svc.1.bbb", Host: "local", State: "running", StartedAt: t0.Add(10 * time.Minute)},
+		{ID: "dead", Name: "svc.1.ccc", Host: "local", State: "exited", StartedAt: t0.Add(20 * time.Minute)},
+	})
+
+	host, id, note, err := resolveContainerRefRead("svc.1", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "local", host)
+	// Running beats a newer-but-dead container; among running, newest StartedAt.
+	assert.Equal(t, "r-new", id)
+	assert.NotEmpty(t, note)
+	assert.Contains(t, note, "svc.1.bbb")
+	assert.Contains(t, note, "running")
+}
+
+func TestResolveRead_SingleRunning_NoNote(t *testing.T) {
+	// Exactly one running among corpses — same outcome as the write path, and
+	// since the choice is unambiguous there is nothing to disclose: no note.
+	deps := resolverDeps([]container.Container{
+		{ID: "dead", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		{ID: "live", Name: "svc.1.bbb", Host: "local", State: "running"},
+	})
+
+	host, id, note, err := resolveContainerRefRead("svc.1", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "local", host)
+	assert.Equal(t, "live", id)
+	assert.Empty(t, note)
+}
+
+func TestResolveRead_UniqueName_NoNote(t *testing.T) {
+	// A plain unique match resolves exactly as before with no note — the read
+	// relaxation only adds behavior to the previously-erroring ambiguous case.
+	deps := resolverDeps([]container.Container{
+		{ID: "abc123def456", Name: "nginx", Host: "local", State: "running"},
+		{ID: "fff999", Name: "redis", Host: "local", State: "running"},
+	})
+
+	host, id, note, err := resolveContainerRefRead("nginx", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "local", host)
+	assert.Equal(t, "abc123def456", id)
+	assert.Empty(t, note)
+}
+
+func TestResolveRead_AmbiguousAcrossHosts_PicksNewestRunning(t *testing.T) {
+	// Same name on two hosts. The write path refuses (host_id needed). The read
+	// path is allowed to pick the newest running one across hosts — reads are
+	// safe and the note discloses the cross-host siblings.
+	t0 := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+	deps := resolverDeps([]container.Container{
+		{ID: "a", Name: "nginx", Host: "host-a", State: "running", StartedAt: t0},
+		{ID: "b", Name: "nginx", Host: "host-b", State: "running", StartedAt: t0.Add(1 * time.Minute)},
+	})
+
+	host, id, note, err := resolveContainerRefRead("nginx", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "host-b", host)
+	assert.Equal(t, "b", id)
+	assert.NotEmpty(t, note)
+}
+
+func TestResolveRead_NotFound_StillErrors(t *testing.T) {
+	// No match at all is still an error — the relaxation never invents a target.
+	deps := resolverDeps([]container.Container{
+		{ID: "id1", Name: "nginx", Host: "local"},
+	})
+
+	_, _, _, err := resolveContainerRefRead("does-not-exist", "", deps)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "no container matching")
+}
+
+func TestResolveRead_HostScoped_StillRespectsHost(t *testing.T) {
+	// An explicit host still scopes the search even on the read path.
+	deps := resolverDeps([]container.Container{
+		{ID: "id1", Name: "nginx", Host: "host-a", State: "running"},
+		{ID: "id2", Name: "nginx", Host: "host-b", State: "running"},
+	})
+
+	host, id, _, err := resolveContainerRefRead("nginx", "host-b", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "host-b", host)
+	assert.Equal(t, "id2", id)
+}
+
+// --- write path stays strict: the relaxation must NOT leak into writes ---
+
+func TestResolveWrite_AllStopped_StillRefuses(t *testing.T) {
+	// The read path now resolves this; the write path must still refuse, because
+	// picking a destructive target among corpses is exactly the ambiguity write
+	// tools must never guess at.
+	deps := resolverDeps([]container.Container{
+		{ID: "s1", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		{ID: "s2", Name: "svc.1.bbb", Host: "local", State: "exited"},
+	})
+
+	_, _, err := resolveContainerRef("svc.1", "", deps)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "matches multiple containers")
+}
+
+func TestResolveWrite_MultipleRunning_StillRefuses(t *testing.T) {
+	deps := resolverDeps([]container.Container{
+		{ID: "r1", Name: "svc.1.aaa", Host: "local", State: "running"},
+		{ID: "r2", Name: "svc.1.bbb", Host: "local", State: "running"},
+	})
+
+	_, _, err := resolveContainerRef("svc.1", "", deps)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "matches multiple containers")
+}
+
+// --- end-to-end through ExecuteTool: read tools resolve in one shot ---
+
+// closedLogsClientService is a MockClientService whose LogsBetweenDates returns
+// an already-closed channel, so executeFetchContainerLogs's range loop drains
+// immediately instead of blocking on the bare mock's nil channel. Embeds
+// MockClientService so it satisfies the full ClientService interface unchanged.
+type closedLogsClientService struct {
+	MockClientService
+}
+
+func (c *closedLogsClientService) LogsBetweenDates(_ context.Context, _ container.Container, _ time.Time, _ time.Time, _ container.StdType) (<-chan *container.LogEvent, error) {
+	ch := make(chan *container.LogEvent)
+	close(ch)
+	return ch, nil
+}
+
+func TestExecuteTool_FetchLogs_AllStopped_ResolvesNewestCorpseWithNote(t *testing.T) {
+	// fetch_container_logs on a crash-looped "svc.1" (every task exited) must
+	// succeed against the most-recently-active corpse and surface the pick in
+	// the result, not fail and force a find_containers round-trip.
+	t0 := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+	mockHost := &MockHostService{}
+	withResolver(mockHost,
+		container.Container{ID: "old", Name: "svc.1.aaa", Host: "local", State: "exited", StartedAt: t0},
+		container.Container{ID: "newest", Name: "svc.1.bbb", Host: "local", State: "exited", StartedAt: t0.Add(5 * time.Minute)},
+	)
+	newestC := container.Container{ID: "newest", Name: "svc.1.bbb", Host: "local", State: "exited", StartedAt: t0.Add(5 * time.Minute)}
+	cs := container_support.NewContainerService(&closedLogsClientService{}, newestC)
+	mockHost.On("FindContainer", "local", "newest", container.ContainerLabels(nil)).Return(cs, nil)
+
+	resp := ExecuteTool(context.Background(), "fetch_container_logs", `{"container_id":"svc.1"}`, ToolDeps{HostService: mockHost})
+	assert.True(t, resp.Success)
+	// The returned container_name discloses the resolution so the model gets the
+	// sibling context for free in the same turn.
+	name := resp.GetFetchLogs().ContainerName
+	assert.Contains(t, name, "svc.1.bbb")
+	assert.Contains(t, name, "svc.1") // mentions the ambiguous ref it resolved
+	mockHost.AssertCalled(t, "FindContainer", "local", "newest", container.ContainerLabels(nil))
+}
+
+func TestExecuteTool_InspectContainer_AllStopped_ResolvesNewestCorpse(t *testing.T) {
+	// inspect_container on the same crash-loop resolves to the newest corpse in
+	// one shot. Inspect returns the concrete id/name/state (itself
+	// disambiguating), so we don't mangle its structured fields with a note.
+	t0 := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+	mockHost := &MockHostService{}
+	withResolver(mockHost,
+		container.Container{ID: "old", Name: "svc.1.aaa", Host: "local", State: "exited", StartedAt: t0},
+		container.Container{ID: "newest", Name: "svc.1.bbb", Host: "local", State: "exited", StartedAt: t0.Add(5 * time.Minute)},
+	)
+	newestC := container.Container{ID: "newest", Name: "svc.1.bbb", Host: "local", State: "exited", StartedAt: t0.Add(5 * time.Minute)}
+	cs := container_support.NewContainerService(&MockClientService{}, newestC)
+	mockHost.On("FindContainer", "local", "newest", container.ContainerLabels(nil)).Return(cs, nil)
+
+	resp := ExecuteTool(context.Background(), "inspect_container", `{"container_id":"svc.1"}`, ToolDeps{HostService: mockHost})
+	assert.True(t, resp.Success)
+	assert.Equal(t, "newest", resp.GetInspectContainer().Id)
+	assert.Equal(t, "svc.1.bbb", resp.GetInspectContainer().Name)
+}
+
+func TestExecuteTool_RestartContainer_AllStopped_StillRefuses(t *testing.T) {
+	// The destructive counterpart of the fetch_logs test above: the SAME
+	// ambiguous all-stopped ref that reads now resolve must still refuse for a
+	// write tool, and must never reach FindContainer / ContainerAction.
+	mockClient := &MockClientService{}
+	mockHost := &MockHostService{}
+	withResolver(mockHost,
+		container.Container{ID: "s1", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		container.Container{ID: "s2", Name: "svc.1.bbb", Host: "local", State: "exited"},
+	)
+
+	resp := ExecuteTool(context.Background(), "restart_container", `{"container_id":"svc.1"}`, ToolDeps{HostService: mockHost, EnableActions: true})
+	assert.False(t, resp.Success)
+	assert.Contains(t, resp.Error, "matches multiple containers")
+	mockHost.AssertNotCalled(t, "FindContainer", mock.Anything, mock.Anything, mock.Anything)
+	mockClient.AssertNotCalled(t, "ContainerAction", mock.Anything, mock.Anything, mock.Anything)
+}
@@ -135,6 +135,52 @@ func TestResolveContainerRef_AmbiguousSubstring(t *testing.T) {
 	assert.NotContains(t, err.Error(), "host_id")
 }

+func TestResolveContainerRef_PrefersRunningAmongStoppedCorpses(t *testing.T) {
+	// The motivating case: Docker Swarm leaves stopped task containers behind
+	// across redeploys, so the short task name "svc.1" substring-matches every
+	// historical task — but only the current one is running. The single live
+	// container is the unambiguous referent; resolve to it instead of erroring.
+	deps := resolverDeps([]container.Container{
+		{ID: "old1", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		{ID: "old2", Name: "svc.1.bbb", Host: "local", State: "exited"},
+		{ID: "live", Name: "svc.1.ccc", Host: "local", State: "running"},
+		{ID: "old3", Name: "svc.1.ddd", Host: "local", State: "exited"},
+	})
+
+	host, id, err := resolveContainerRef("svc.1", "", deps)
+	assert.NoError(t, err)
+	assert.Equal(t, "local", host)
+	assert.Equal(t, "live", id)
+}
+
+func TestResolveContainerRef_MultipleRunningStillAmbiguous(t *testing.T) {
+	// Two genuinely live replicas — we must NOT guess, and the listing must
+	// surface each candidate's state so the caller can pick the right one.
+	deps := resolverDeps([]container.Container{
+		{ID: "r1", Name: "svc.1.aaa", Host: "host-a", State: "running"},
+		{ID: "r2", Name: "svc.2.bbb", Host: "host-b", State: "running"},
+	})
+
+	_, _, err := resolveContainerRef("svc", "", deps)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "matches multiple containers")
+	assert.Contains(t, err.Error(), "running")
+}
+
+func TestResolveContainerRef_AllStoppedStillAmbiguous(t *testing.T) {
+	// No live container to prefer — refuse and list the candidates with their
+	// state rather than picking an arbitrary corpse.
+	deps := resolverDeps([]container.Container{
+		{ID: "s1", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		{ID: "s2", Name: "svc.1.bbb", Host: "local", State: "exited"},
+	})
+
+	_, _, err := resolveContainerRef("svc.1", "", deps)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "matches multiple containers")
+	assert.Contains(t, err.Error(), "exited")
+}
+
 func TestResolveContainerRef_HostDisambiguates(t *testing.T) {
 	// Same name on two hosts; supplying the host id resolves cleanly.
 	deps := resolverDeps([]container.Container{
@@ -272,3 +318,23 @@ func TestExecuteTool_RestartContainer_ByName_HostInferred(t *testing.T) {
 	assert.Equal(t, "id1", resp.GetAction().ContainerId)
 	mockClient.AssertCalled(t, "ContainerAction", mock.Anything, mock.Anything, container.Restart)
 }
+
+func TestExecuteTool_RestartContainer_PrefersRunningReplica(t *testing.T) {
+	// A name matching one running task and several stopped corpses resolves to
+	// the live task even for a write tool — the corpses are never a valid target,
+	// so picking the single running container is safe (and what the user means).
+	mockClient := &MockClientService{}
+	mockClient.On("ContainerAction", mock.Anything, mock.Anything, container.Restart).Return(nil)
+	cs := container_support.NewContainerService(mockClient, container.Container{ID: "live", Name: "svc.1.ccc", Host: "local", State: "running"})
+
+	mockHost := &MockHostService{}
+	withResolver(mockHost,
+		container.Container{ID: "dead", Name: "svc.1.aaa", Host: "local", State: "exited"},
+		container.Container{ID: "live", Name: "svc.1.ccc", Host: "local", State: "running"},
+	)
+	mockHost.On("FindContainer", "local", "live", container.ContainerLabels(nil)).Return(cs, nil)
+
+	resp := ExecuteTool(context.Background(), "restart_container", `{"container_id":"svc.1"}`, ToolDeps{HostService: mockHost, EnableActions: true})
+	assert.True(t, resp.Success)
+	assert.Equal(t, "live", resp.GetAction().ContainerId)
+}
@@ -68,7 +68,11 @@ func executeStreamLogs(ctx context.Context, requestID string, argsJSON string, d
 		return err
 	}

-	hostID, containerID, err := resolveContainerRef(args.ContainerID, args.Host, deps)
+	// Read-only: resolve an ambiguous name in one shot instead of erroring. note
+	// is non-empty when the name resolved to one of several candidates; it is
+	// surfaced once, on the first emitted batch, so the model learns the pick and
+	// its siblings without a round-trip and without repeating on every batch.
+	hostID, containerID, note, err := resolveContainerRefRead(args.ContainerID, args.Host, deps)
 	if err != nil {
 		return err
 	}
@@ -87,10 +91,16 @@ func executeStreamLogs(ctx context.Context, requestID string, argsJSON string, d
 		}
 	}()

+	noteSent := false
 	sendBatch := func(entries []*pb.LogEntry, endStream bool) error {
 		if len(entries) == 0 && !endStream {
 			return nil
 		}
+		name := cs.Container.Name
+		if !noteSent {
+			name = withResolutionNote(name, note)
+			noteSent = true
+		}
 		resp := &pb.ToolResponse{
 			RequestId: requestID,
 			Type: &pb.ToolResponse_CallTool{
@@ -100,7 +110,7 @@ func executeStreamLogs(ctx context.Context, requestID string, argsJSON string, d
 					EndStream: endStream,
 					Result: &pb.CallToolResponse_FetchLogs{
 						FetchLogs: &pb.FetchLogsResult{
-							ContainerName: cs.Container.Name,
+							ContainerName: name,
 							Entries:       entries,
 						},
 					},
@@ -162,4 +172,3 @@ func executeStreamLogs(ctx context.Context, requestID string, argsJSON string, d
 		}
 	}
 }
-