From 87bc48338d81fb89acb4861c9ea7ae59145e3318 Mon Sep 17 00:00:00 2001 From: Amir Raminfar Date: Mon, 1 Jun 2026 15:56:00 -0700 Subject: [PATCH] fix: rank log level guesses by confidence (#4772) Co-authored-by: Claude Opus 4.8 (1M context) --- internal/container/level_guesser.go | 96 ++++++++++++++++-------- internal/container/level_guesser_test.go | 11 +++ 2 files changed, 76 insertions(+), 31 deletions(-) diff --git a/internal/container/level_guesser.go b/internal/container/level_guesser.go index 2dcdc9ed..c4609415 100644 --- a/internal/container/level_guesser.go +++ b/internal/container/level_guesser.go @@ -2,6 +2,7 @@ package container import ( "regexp" + "sort" "strings" "github.com/rs/zerolog/log" @@ -23,20 +24,30 @@ var logLevels = [][]string{ // aliasToCanonical maps every alias to its canonical level name. var aliasToCanonical = map[string]string{} -// levelRegexes holds one combined regex per canonical level. Each regex is an -// alternation of all the shapes a level can take in a log line: +// levelMatcher extracts a canonical log level from a line. re must expose a +// single capture group holding the level alias, or a single-letter code when +// single is true (mapped via singleLetterToLevel). +type levelMatcher struct { + re *regexp.Regexp + single bool +} + +// levelTiers groups matchers by how confidently their shape identifies the log +// level, highest confidence first: // -// (?i:^<alt>[^a-z] // plain prefix: "error: ..." -// |\[ ?<alt> ?\] // bracketed: "[ERROR]" / "[ error ]" -// | <alt>[/|:-] // separator: " error|", " info:" (z2m) -// |:<alt>\s) // colon prefix: "Tag:info " (z2m) -// |"<ALT>" // quoted: "\"ERROR\"" -// |\s<ALT>\s // spaced: " ERROR " +// 1. ^<alias> start-of-line prefix: "ERROR: ...", "INF ..." +// 2. [<alias>] bracketed tag / single-letter: "[ERROR]", "[E]" +// 3. :<alias> structured prefix: "Zigbee2MQTT:info " +// 4. "<ALIAS>" quoted upper-case value: LL="ERROR" +// 5. <alias>[/|:-] separator: " error:", " info|" +// 6. <ALIAS>
bare upper-case token mid-line: "123 ERROR foo" // -// The case-insensitive group covers the boundary-anchored forms; the trailing -// uppercase-only branches catch mid-line `ERROR` tokens without false-firing on -// the word "error" in prose. -var levelRegexes = map[string]*regexp.Regexp{} +// guessFromString walks the tiers in order and stops at the first that matches, +// so a real level prefix at the front of the line always beats a level word +// buried in the message body. Within a tier, two different levels mean the line +// is ambiguous and we return "unknown" rather than guess. Match position and +// level severity are deliberately not used as tie-breakers. +var levelTiers [][]levelMatcher // singleLetterBracket matches single-letter levels in brackets, e.g. [I], [E], [W] var singleLetterBracket = regexp.MustCompile(`\[([EWIDFTV])\]`) @@ -48,26 +59,33 @@ var levelKeys = []string{"@l", "level", "log.level", "severity"} func init() { SupportedLogLevels = make(map[string]struct{}, len(logLevels)+1) + var aliases []string for _, group := range logLevels { canonical := group[0] SupportedLogLevels[canonical] = struct{}{} for _, alias := range group { aliasToCanonical[alias] = canonical + aliases = append(aliases, alias) } - - alt := "(?:" + strings.Join(group, "|") + ")" - upper := strings.ToUpper(alt) - - levelRegexes[canonical] = regexp.MustCompile( - `(?i:^` + alt + `[^a-z]` + - `|\[ ?` + alt + ` ?\]` + - `| ` + alt + `[/|:-]` + - `|:` + alt + `\s)` + - `|"` + upper + `"` + - `|\s` + upper + `\s`, - ) } SupportedLogLevels["unknown"] = struct{}{} + + // Longest aliases first so e.g. "warning" is preferred over "warn". 
+ sort.SliceStable(aliases, func(i, j int) bool { return len(aliases[i]) > len(aliases[j]) }) + joined := strings.Join(aliases, "|") + upper := strings.ToUpper(joined) + + levelTiers = [][]levelMatcher{ + {{re: regexp.MustCompile(`(?i)^(` + joined + `)[^a-z]`)}}, + { + {re: regexp.MustCompile(`(?i)\[ ?(` + joined + `) ?\]`)}, + {re: singleLetterBracket, single: true}, + }, + {{re: regexp.MustCompile(`(?i):(` + joined + `)\s`)}}, + {{re: regexp.MustCompile(`"(` + upper + `)"`)}}, + {{re: regexp.MustCompile(`(?i) (` + joined + `)[/|:-]`)}}, + {{re: regexp.MustCompile(`\s(` + upper + `)\s`)}}, + } } func guessLogLevel(logEvent *LogEvent) string { @@ -117,14 +135,30 @@ var singleLetterToLevel = map[byte]string{ func guessFromString(value string) string { value = StripANSI(value) value = timestampRegex.ReplaceAllString(value, "") - for _, group := range logLevels { - if levelRegexes[group[0]].MatchString(value) { - return group[0] + for _, tier := range levelTiers { + level := "" + for _, m := range tier { + for _, match := range m.re.FindAllStringSubmatch(value, -1) { + var canonical string + if m.single { + canonical = singleLetterToLevel[match[1][0]] + } else { + canonical = aliasToCanonical[strings.ToLower(match[1])] + } + if canonical == "" { + continue + } + if level == "" { + level = canonical + } else if level != canonical { + // Two different levels at the same confidence: ambiguous. 
+ return "unknown" + } + } + } + if level != "" { + return level } - } - - if m := singleLetterBracket.FindStringSubmatch(value); m != nil { - return singleLetterToLevel[m[1][0]] } return "unknown" } diff --git a/internal/container/level_guesser_test.go b/internal/container/level_guesser_test.go index b0b406f2..b629111d 100644 --- a/internal/container/level_guesser_test.go +++ b/internal/container/level_guesser_test.go @@ -107,6 +107,17 @@ func TestGuessLogLevel(t *testing.T) { {"[T] trace message", "trace"}, {"[V] verbose message", "trace"}, {"12:00:00 [I] starting up", "info"}, + // Issue #4768: a real level prefix must win over a level word in the message body. + {"INFO: connection established, retrying after error: timeout", "info"}, + {"INFO handling request failed with error: bad gateway", "info"}, + {"2024-12-30T17:43:16Z INF some message about an error: foo", "info"}, + {"INFO request completed but contained ERROR token", "info"}, + {"WARN: connection error: retrying", "warn"}, + // Symmetric: an ERROR prefix still wins over a later info word. + {"ERROR: handler failed, info: will retry", "error"}, + // Equal confidence between two different levels -> unknown (don't guess). + {"saw info: here and error: there", "unknown"}, + {"[INFO] [DEBUG] both bracketed", "unknown"}, {orderedmap.New[string, any]( orderedmap.WithInitialData( orderedmap.Pair[string, any]{Key: "key", Value: "value"},