Files
cocos/pkg/oci/extract_test.go
Sammy Kerata Oina b44780df95
CI / lint (push) Has been cancelled
CI / test (agent) (push) Has been cancelled
CI / test (cli) (push) Has been cancelled
CI / test (cmd) (push) Has been cancelled
CI / test (internal) (push) Has been cancelled
CI / test (manager, true) (push) Has been cancelled
CI / test (pkg) (push) Has been cancelled
CI / upload-coverage (push) Has been cancelled
NOISSUE - Enhance OCI image extraction to return algorithm and requirements paths, and add deferred cleanup for temporary files (#586)
* feat: Enhance OCI image extraction to return algorithm and requirements paths, and add deferred cleanup for temporary files.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* feat: implement deterministic zipping and enhance checksum verification for resources

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* feat: Update component build sources, add gRPC health checks to the CVM server, and refine algorithm argument handling and documentation.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* docs: Update remote resources testing guide with `sudo` for KBS, algorithm result saving, `requirements.txt`, and `algo-args` for RVPS.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* refactor: Explicitly ignore `stderr.Write` return values and add minor whitespace in tests.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* test: add comprehensive error path and edge case tests for file, zip, OCI, and agent components.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* feat: Add mutexes for thread-safe algorithm execution and expand recognized data file extensions to include common archive formats.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* feat: Add OCI extraction tests for Python algorithms and multi-layer datasets, refactor algorithm execution for testability, and enhance algorithm stop and error handling tests.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* test: Add error assertions to OCI extraction test helpers and remove an unused mock exec command.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* test: Improve error handling test coverage for algorithm execution and OCI resource extraction.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* fix: Improve algorithm process termination, enhance computation error handling, and add concurrency safety to agent service.

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

---------

Signed-off-by: Sammy Oina <sammyoina@gmail.com>
2026-03-27 14:23:52 +01:00

1138 lines
33 KiB
Go

// Copyright (c) Ultraviolet
// SPDX-License-Identifier: Apache-2.0
package oci
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"encoding/json"
"log/slog"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const testPythonScript = "print('hello')"
func TestIsAlgorithmFile(t *testing.T) {
tests := []struct {
name string
filename string
mode int64
algoType string
want bool
}{
{"Python file", "algorithm.py", 0o644, "python", true},
{"WASM file", "module.wasm", 0o644, "wasm", true},
{"WAT file", "module.wat", 0o644, "wasm", true},
{"Python file as bin", "algorithm.py", 0o755, "bin", false},
{"Main python file", "main.py", 0o644, "python", true},
{"Binary file with common name", "algorithm", 0o644, "bin", true},
{"Binary file with common name run", "run", 0o644, "bin", true},
{"Executable binary", "my-app", 0o755, "bin", true},
{"CSV data file", "data.csv", 0o755, "python", false},
{"JSON config file", "config.json", 0o755, "wasm", false},
{"Text file", "readme.txt", 0o755, "bin", false},
{"Uppercase extension", "MAIN.PY", 0o644, "python", true},
{"Mixed case", "Algorithm.Py", 0o644, "python", true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := isAlgorithmFile(tt.filename, tt.mode, tt.algoType)
assert.Equal(t, tt.want, got)
})
}
}
func TestIsDataFile(t *testing.T) {
tests := []struct {
name string
filename string
want bool
}{
{"CSV file", "data.csv", true},
{"JSON file", "config.json", true},
{"Text file", "readme.txt", true},
{"Parquet file", "data.parquet", true},
{"Arrow file", "data.arrow", true},
{"DAT file", "data.dat", true},
{"Python file", "script.py", false},
{"WASM file", "module.wasm", false},
{"Binary file", "data.bin", false},
{"Uppercase CSV", "DATA.CSV", true},
{"Nested path", "data/input/dataset.csv", true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := isDataFile(tt.filename)
assert.Equal(t, tt.want, got)
})
}
}
// TestExtractAlgorithm exercises ExtractAlgorithm against a missing index.json,
// an unparsable index.json, an index without manifests, and a valid
// single-layer image built by setupTestOCIImage.
func TestExtractAlgorithm(t *testing.T) {
	logger := slog.Default()

	t.Run("missing index.json", func(t *testing.T) {
		tempDir := t.TempDir()

		_, _, err := ExtractAlgorithm(context.Background(), logger, tempDir, t.TempDir(), "")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to read index.json")
	})

	t.Run("invalid index.json", func(t *testing.T) {
		tempDir := t.TempDir()
		require.NoError(t, os.WriteFile(filepath.Join(tempDir, "index.json"), []byte("not json"), 0o644))

		_, _, err := ExtractAlgorithm(context.Background(), logger, tempDir, t.TempDir(), "")
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to parse index.json")
	})

	t.Run("empty manifests", func(t *testing.T) {
		tempDir := t.TempDir()
		data, err := json.Marshal(OCIIndex{SchemaVersion: 2})
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(tempDir, "index.json"), data, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, tempDir, t.TempDir(), "")
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no manifests found")
	})

	t.Run("successful extraction", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "algorithm.py", testPythonScript)

		algoPath, _, err := ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "python")
		require.NoError(t, err)
		assert.NotEmpty(t, algoPath)
		assert.Contains(t, algoPath, "algorithm.py")
	})
}
// TestExtractDataset checks ExtractDataset for a directory without index.json
// and for a valid single-layer image containing data.csv.
func TestExtractDataset(t *testing.T) {
	t.Run("missing index.json", func(t *testing.T) {
		tempDir := t.TempDir()

		_, err := ExtractDataset(tempDir, t.TempDir())
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to read index.json")
	})

	t.Run("successful extraction", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "data.csv", "col1,col2\n1,2")

		files, err := ExtractDataset(ociDir, destDir)
		require.NoError(t, err)
		assert.NotEmpty(t, files)
	})
}
// TestExtractDatasetWithPathTraversal feeds ExtractDataset a layer holding a
// "../../../tmp/evil.csv" entry followed by a regular data.csv. It expects
// exactly one extracted file (data.csv) and nothing written at the escaped
// location.
func TestExtractDatasetWithPathTraversal(t *testing.T) {
	t.Run("path traversal skipped, valid file extracted", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		layerFile, err := os.Create(filepath.Join(blobsDir, "layer123"))
		require.NoError(t, err)
		gw := gzip.NewWriter(layerFile)
		tw := tar.NewWriter(gw)

		// Path traversal entry (should be skipped)
		maliciousHdr := &tar.Header{
			Name: "../../../tmp/evil.csv",
			Mode: 0o644,
			Size: int64(len("evil")),
		}
		require.NoError(t, tw.WriteHeader(maliciousHdr))
		_, err = tw.Write([]byte("evil"))
		require.NoError(t, err)

		// Valid CSV file
		csvContent := "col1,col2\n1,2"
		csvHdr := &tar.Header{
			Name: "data.csv",
			Mode: 0o644,
			Size: int64(len(csvContent)),
		}
		require.NoError(t, tw.WriteHeader(csvHdr))
		_, err = tw.Write([]byte(csvContent))
		require.NoError(t, err)

		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())
		require.NoError(t, layerFile.Close())

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:layer123"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		files, err := ExtractDataset(ociDir, destDir)
		require.NoError(t, err)
		// require.Len guards the files[0] access below against an
		// index-out-of-range panic.
		require.Len(t, files, 1)
		assert.Contains(t, files[0], "data.csv")

		// Verify malicious file was NOT created outside destDir. Besides the
		// fixed /tmp location, also check where the entry name resolves
		// relative to destDir, because the temp dir depth varies by platform.
		for _, escaped := range []string{"/tmp/evil.csv", filepath.Join(destDir, maliciousHdr.Name)} {
			_, statErr := os.Stat(escaped)
			assert.True(t, os.IsNotExist(statErr), "unexpected file at %s", escaped)
		}
	})
}
// TestExtractDatasetInvalidManifest expects ExtractDataset to fail with
// "failed to parse manifest" when the manifest blob referenced by index.json
// contains text that is not JSON.
func TestExtractDatasetInvalidManifest(t *testing.T) {
	t.Run("invalid manifest JSON", func(t *testing.T) {
		ociDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), []byte("not json"), 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: 8}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, err = ExtractDataset(ociDir, t.TempDir())
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to parse manifest")
	})
}
// TestExtractDatasetWithDirectory builds a layer containing a "data/"
// directory entry plus "data/dataset.csv" and expects ExtractDataset to
// report exactly one file whose path contains "dataset.csv".
func TestExtractDatasetWithDirectory(t *testing.T) {
	type layerRef struct {
		Digest string `json:"digest"`
	}
	type imageManifest struct {
		Layers []layerRef `json:"layers"`
	}
	type indexEntry = struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
		Size      int    `json:"size"`
	}

	t.Run("layer with directory entries for dataset", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobs := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobs, 0o755))

		out, err := os.Create(filepath.Join(blobs, "layer123"))
		require.NoError(t, err)
		gz := gzip.NewWriter(out)
		tarball := tar.NewWriter(gz)

		// Directory entry
		require.NoError(t, tarball.WriteHeader(&tar.Header{
			Name:     "data/",
			Mode:     0o755,
			Typeflag: tar.TypeDir,
		}))

		// CSV inside directory
		csv := []byte("a,b\n1,2")
		require.NoError(t, tarball.WriteHeader(&tar.Header{
			Name: "data/dataset.csv",
			Mode: 0o644,
			Size: int64(len(csv)),
		}))
		_, err = tarball.Write(csv)
		require.NoError(t, err)

		require.NoError(t, tarball.Close())
		require.NoError(t, gz.Close())
		require.NoError(t, out.Close())

		// Marshal errors are ignored: these fixtures contain only plain
		// string and int fields.
		rawManifest, _ := json.Marshal(imageManifest{Layers: []layerRef{{Digest: "sha256:layer123"}}})
		require.NoError(t, os.WriteFile(filepath.Join(blobs, "manifest123"), rawManifest, 0o644))

		rawIndex, _ := json.Marshal(OCIIndex{
			SchemaVersion: 2,
			Manifests:     []indexEntry{{Digest: "sha256:manifest123", Size: len(rawManifest)}},
		})
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), rawIndex, 0o644))

		extracted, err := ExtractDataset(ociDir, destDir)
		require.NoError(t, err)
		require.Len(t, extracted, 1)
		assert.Contains(t, extracted[0], "dataset.csv")
	})
}
// TestExtractDatasetMissingManifest expects ExtractDataset to fail with
// "failed to read manifest" when index.json references a manifest digest for
// which no blob file was written.
func TestExtractDatasetMissingManifest(t *testing.T) {
	t.Run("manifest file not found", func(t *testing.T) {
		ociDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:nonexistent", Size: 0}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, err = ExtractDataset(ociDir, t.TempDir())
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to read manifest")
	})
}
// TestOCILayoutStructure round-trips an OCILayout through encoding/json and
// checks that ImageLayoutVersion survives unchanged.
func TestOCILayoutStructure(t *testing.T) {
	t.Run("OCILayout JSON serialization", func(t *testing.T) {
		original := OCILayout{ImageLayoutVersion: "1.0.0"}

		encoded, err := json.Marshal(original)
		require.NoError(t, err)

		var roundTripped OCILayout
		require.NoError(t, json.Unmarshal(encoded, &roundTripped))

		assert.Equal(t, original.ImageLayoutVersion, roundTripped.ImageLayoutVersion)
	})
}
// setupTestOCIImage creates two fresh temp directories and populates the first
// with a minimal OCI layout: blobs/sha256/layer123 (a gzipped tar holding one
// file named filename with the given content), blobs/sha256/manifest123
// referencing that layer, and index.json referencing the manifest. It returns
// the layout directory and the still-empty destination directory. Any setup
// failure aborts the calling test.
func setupTestOCIImage(t *testing.T, filename, content string) (ociDir, destDir string) {
	t.Helper()

	type layerRef struct {
		Digest string `json:"digest"`
	}
	type imageManifest struct {
		Layers []layerRef `json:"layers"`
	}
	type indexEntry = struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
		Size      int    `json:"size"`
	}

	ociDir, destDir = t.TempDir(), t.TempDir()

	blobs := filepath.Join(ociDir, "blobs", "sha256")
	require.NoError(t, os.MkdirAll(blobs, 0o755))

	// Layer blob: a single regular file inside a gzipped tar.
	out, err := os.Create(filepath.Join(blobs, "layer123"))
	require.NoError(t, err)
	gz := gzip.NewWriter(out)
	tarball := tar.NewWriter(gz)

	payload := []byte(content)
	require.NoError(t, tarball.WriteHeader(&tar.Header{
		Name: filename,
		Mode: 0o644,
		Size: int64(len(payload)),
	}))
	_, err = tarball.Write(payload)
	require.NoError(t, err)

	// Close innermost writer first so every layer is flushed to disk.
	require.NoError(t, tarball.Close())
	require.NoError(t, gz.Close())
	require.NoError(t, out.Close())

	// Manifest blob pointing at the layer.
	rawManifest, err := json.Marshal(imageManifest{Layers: []layerRef{{Digest: "sha256:layer123"}}})
	require.NoError(t, err)
	require.NoError(t, os.WriteFile(filepath.Join(blobs, "manifest123"), rawManifest, 0o644))

	// Index pointing at the manifest.
	rawIndex, err := json.Marshal(OCIIndex{
		SchemaVersion: 2,
		Manifests: []indexEntry{{
			MediaType: "application/vnd.oci.image.manifest.v1+json",
			Digest:    "sha256:manifest123",
			Size:      len(rawManifest),
		}},
	})
	require.NoError(t, err)
	require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), rawIndex, 0o644))

	return ociDir, destDir
}
// TestExtractAlgorithmWithRequirements builds a layer holding main.py and
// requirements.txt, runs ExtractAlgorithm with type "python", and checks that
// the returned algorithm path mentions main.py and that requirements.txt
// exists in the destination directory.
func TestExtractAlgorithmWithRequirements(t *testing.T) {
	type layerRef struct {
		Digest string `json:"digest"`
	}
	type imageManifest struct {
		Layers []layerRef `json:"layers"`
	}
	type indexEntry = struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
		Size      int    `json:"size"`
	}

	log := slog.Default()

	t.Run("extract algorithm with requirements.txt", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobs := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobs, 0o755))

		out, err := os.Create(filepath.Join(blobs, "layer123"))
		require.NoError(t, err)
		gz := gzip.NewWriter(out)
		tarball := tar.NewWriter(gz)

		// Algorithm file first, then requirements.txt.
		entries := []struct{ name, body string }{
			{name: "main.py", body: testPythonScript},
			{name: "requirements.txt", body: "numpy==1.21.0\npandas==1.3.0"},
		}
		for _, entry := range entries {
			require.NoError(t, tarball.WriteHeader(&tar.Header{
				Name: entry.name,
				Mode: 0o644,
				Size: int64(len(entry.body)),
			}))
			_, err = tarball.Write([]byte(entry.body))
			require.NoError(t, err)
		}

		require.NoError(t, tarball.Close())
		require.NoError(t, gz.Close())
		require.NoError(t, out.Close())

		// Create manifest and index
		rawManifest, err := json.Marshal(imageManifest{Layers: []layerRef{{Digest: "sha256:layer123"}}})
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobs, "manifest123"), rawManifest, 0o644))

		rawIndex, err := json.Marshal(OCIIndex{
			SchemaVersion: 2,
			Manifests:     []indexEntry{{Digest: "sha256:manifest123", Size: len(rawManifest)}},
		})
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), rawIndex, 0o644))

		algoPath, _, err := ExtractAlgorithm(context.Background(), log, ociDir, destDir, "python")
		require.NoError(t, err)
		assert.Contains(t, algoPath, "main.py")

		// Verify requirements.txt was also extracted
		_, err = os.Stat(filepath.Join(destDir, "requirements.txt"))
		assert.NoError(t, err)
	})
}
// TestExtractAlgorithmNoAlgoFile builds a layer that only contains README.md
// and expects ExtractAlgorithm (called with an empty algorithm type) to fail
// with "no algorithm file found".
func TestExtractAlgorithmNoAlgoFile(t *testing.T) {
	logger := slog.Default()

	t.Run("no algorithm file in layers", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		layerFile, err := os.Create(filepath.Join(blobsDir, "layer123"))
		require.NoError(t, err)
		gw := gzip.NewWriter(layerFile)
		tw := tar.NewWriter(gw)

		// Add a non-algorithm file (e.g., just a readme)
		readmeContent := "This is a readme"
		readmeHdr := &tar.Header{
			Name: "README.md",
			Mode: 0o644,
			Size: int64(len(readmeContent)),
		}
		require.NoError(t, tw.WriteHeader(readmeHdr))
		_, err = tw.Write([]byte(readmeContent))
		require.NoError(t, err)

		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())
		require.NoError(t, layerFile.Close())

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:layer123"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no algorithm file found")
	})
}
// TestExtractDatasetNoDataFiles covers two failing ExtractDataset inputs: a
// layer that only contains script.py (expects "no dataset files found") and a
// layer blob that is not gzip data (expects some error).
func TestExtractDatasetNoDataFiles(t *testing.T) {
	t.Run("no data files in layers", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		layerFile, err := os.Create(filepath.Join(blobsDir, "layer123"))
		require.NoError(t, err)
		gw := gzip.NewWriter(layerFile)
		tw := tar.NewWriter(gw)

		// Add a python file (not a data file)
		pyContent := testPythonScript
		pyHdr := &tar.Header{
			Name: "script.py",
			Mode: 0o644,
			Size: int64(len(pyContent)),
		}
		require.NoError(t, tw.WriteHeader(pyHdr))
		_, err = tw.Write([]byte(pyContent))
		require.NoError(t, err)

		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())
		require.NoError(t, layerFile.Close())

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:layer123"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, err = ExtractDataset(ociDir, destDir)
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no dataset files found")
	})

	t.Run("corrupt layer file", func(t *testing.T) {
		ociDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "layer123"), []byte("not a gzip"), 0o644))

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:layer123"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		// ExtractDataset logs a warning and continues if a layer fails, but if ALL fail it errors
		_, err = ExtractDataset(ociDir, t.TempDir())
		assert.Error(t, err)
	})
}
// TestExtractAlgorithmInvalidManifest expects ExtractAlgorithm to fail with
// "failed to parse manifest" when the manifest blob referenced by index.json
// contains text that is not JSON.
func TestExtractAlgorithmInvalidManifest(t *testing.T) {
	logger := slog.Default()

	t.Run("invalid manifest JSON", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		// Write invalid manifest
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), []byte("not json"), 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: 8}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to parse manifest")
	})
}
// TestExtractAlgorithmMissingManifest expects ExtractAlgorithm to fail with
// "failed to read manifest" when index.json references a manifest digest for
// which no blob file was written.
func TestExtractAlgorithmMissingManifest(t *testing.T) {
	logger := slog.Default()

	t.Run("manifest file not found", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		// Don't create manifest file
		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:missing123", Size: 8}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "failed to read manifest")
	})
}
// TestExtractAlgorithmWithDirectory builds a layer containing a "src/"
// directory entry plus "src/main.py" and expects ExtractAlgorithm with type
// "python" to return a path that contains "main.py".
func TestExtractAlgorithmWithDirectory(t *testing.T) {
	type layerRef struct {
		Digest string `json:"digest"`
	}
	type imageManifest struct {
		Layers []layerRef `json:"layers"`
	}
	type indexEntry = struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
		Size      int    `json:"size"`
	}

	log := slog.Default()

	t.Run("layer with directory entries", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobs := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobs, 0o755))

		out, err := os.Create(filepath.Join(blobs, "layer123"))
		require.NoError(t, err)
		gz := gzip.NewWriter(out)
		tarball := tar.NewWriter(gz)

		// Add a directory entry
		require.NoError(t, tarball.WriteHeader(&tar.Header{
			Name:     "src/",
			Mode:     0o755,
			Typeflag: tar.TypeDir,
		}))

		// Add algorithm file in subdirectory
		script := []byte(testPythonScript)
		require.NoError(t, tarball.WriteHeader(&tar.Header{
			Name: "src/main.py",
			Mode: 0o644,
			Size: int64(len(script)),
		}))
		_, err = tarball.Write(script)
		require.NoError(t, err)

		require.NoError(t, tarball.Close())
		require.NoError(t, gz.Close())
		require.NoError(t, out.Close())

		// Marshal errors are ignored: these fixtures contain only plain
		// string and int fields.
		rawManifest, _ := json.Marshal(imageManifest{Layers: []layerRef{{Digest: "sha256:layer123"}}})
		require.NoError(t, os.WriteFile(filepath.Join(blobs, "manifest123"), rawManifest, 0o644))

		rawIndex, _ := json.Marshal(OCIIndex{
			SchemaVersion: 2,
			Manifests:     []indexEntry{{Digest: "sha256:manifest123", Size: len(rawManifest)}},
		})
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), rawIndex, 0o644))

		algoPath, _, err := ExtractAlgorithm(context.Background(), log, ociDir, destDir, "python")
		require.NoError(t, err)
		assert.Contains(t, algoPath, "main.py")
	})
}
// TestExtractAlgorithmPathTraversal feeds ExtractAlgorithm a layer holding a
// "../../../etc/malicious.py" entry followed by a regular algorithm.py. It
// expects algorithm.py to be returned and nothing written at the escaped
// location.
func TestExtractAlgorithmPathTraversal(t *testing.T) {
	logger := slog.Default()

	t.Run("path traversal attempt", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		layerFile, err := os.Create(filepath.Join(blobsDir, "layer123"))
		require.NoError(t, err)
		gw := gzip.NewWriter(layerFile)
		tw := tar.NewWriter(gw)

		// Add a file with path traversal attempt
		maliciousContent := "malicious"
		maliciousHdr := &tar.Header{
			Name: "../../../etc/malicious.py",
			Mode: 0o644,
			Size: int64(len(maliciousContent)),
		}
		require.NoError(t, tw.WriteHeader(maliciousHdr))
		_, err = tw.Write([]byte(maliciousContent))
		require.NoError(t, err)

		// Add a legit file
		algoContent := testPythonScript
		algoHdr := &tar.Header{
			Name: "algorithm.py",
			Mode: 0o644,
			Size: int64(len(algoContent)),
		}
		require.NoError(t, tw.WriteHeader(algoHdr))
		_, err = tw.Write([]byte(algoContent))
		require.NoError(t, err)

		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())
		require.NoError(t, layerFile.Close())

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:layer123"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		algoPath, _, err := ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "python")
		require.NoError(t, err)
		assert.Contains(t, algoPath, "algorithm.py")

		// Verify malicious file was NOT extracted outside destDir. Besides the
		// fixed /etc location, also check where the entry name resolves
		// relative to destDir, because the temp dir depth varies by platform.
		for _, escaped := range []string{"/etc/malicious.py", filepath.Join(destDir, maliciousHdr.Name)} {
			_, statErr := os.Stat(escaped)
			assert.True(t, os.IsNotExist(statErr), "unexpected file at %s", escaped)
		}
	})
}
// TestExtractAlgorithmErrorPathsAdditional checks that ExtractAlgorithm reports
// "no algorithm file found" when the only layer is not gzip data, is gzip data
// that is not a tar archive, or does not exist on disk.
func TestExtractAlgorithmErrorPathsAdditional(t *testing.T) {
	logger := slog.Default()

	t.Run("invalid layer gzip", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "main.py", testPythonScript)

		// Corrupt the layer file
		layerPath := filepath.Join(ociDir, "blobs", "sha256", "layer123")
		require.NoError(t, os.WriteFile(layerPath, []byte("not gzip"), 0o644))

		_, _, err := ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no algorithm file found")
	})

	t.Run("invalid tar formatting", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "main.py", testPythonScript)
		layerPath := filepath.Join(ociDir, "blobs", "sha256", "layer123")

		// Create a valid gzip but invalid tar
		var buf bytes.Buffer
		gw := gzip.NewWriter(&buf)
		_, err := gw.Write([]byte("not a tar archive but it is gzipped"))
		require.NoError(t, err)
		// Close flushes the gzip stream; a failure here would leave a
		// truncated fixture, so it must not be ignored.
		require.NoError(t, gw.Close())
		require.NoError(t, os.WriteFile(layerPath, buf.Bytes(), 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no algorithm file found")
	})

	t.Run("non-existent layer file", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:nonexistent"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "")
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no algorithm file found")
	})
}
// TestExtractDatasetErrorPathsAdditional checks that ExtractDataset fails when
// the only layer is not gzip data, and fails with "no dataset files found"
// when the manifest references a layer blob that does not exist.
func TestExtractDatasetErrorPathsAdditional(t *testing.T) {
	t.Run("invalid layer gzip", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "data.csv", "a,b,c")

		layerPath := filepath.Join(ociDir, "blobs", "sha256", "layer123")
		require.NoError(t, os.WriteFile(layerPath, []byte("not gzip"), 0o644))

		_, err := ExtractDataset(ociDir, destDir)
		assert.Error(t, err)
	})

	t.Run("non-existent layer file", func(t *testing.T) {
		ociDir := t.TempDir()
		destDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:nonexistent"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "manifest123"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:manifest123", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, err = ExtractDataset(ociDir, destDir)
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no dataset files found")
	})
}
// TestExtractAlgorithmAdditionalTypes checks that isAlgorithmFile returns
// false for the algorithm types "docker" and "unknown".
func TestExtractAlgorithmAdditionalTypes(t *testing.T) {
	t.Run("isAlgorithmFile additional types", func(t *testing.T) {
		for _, algoType := range []string{"docker", "unknown"} {
			assert.False(t, isAlgorithmFile("any", 0o644, algoType))
		}
	})
}
// TestExtractAlgorithmErrorPathsInternal provokes filesystem conflicts in the
// destination directory: a regular file named "blocked" where the layer needs
// a directory, and a directory named "algorithm.py" where the layer needs a
// file. ExtractAlgorithm is expected to return an error in both cases.
func TestExtractAlgorithmErrorPathsInternal(t *testing.T) {
	logger := slog.Default()

	t.Run("failed to create directory", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "algorithm.py", testPythonScript)

		// Create a file where a directory should be
		blockedDir := filepath.Join(destDir, "blocked")
		require.NoError(t, os.WriteFile(blockedDir, []byte("data"), 0o644))

		// Replace the layer with one whose only entry lives under "blocked/".
		// Every write/close is checked so the test cannot pass merely because
		// the fixture itself ended up malformed.
		const body = "print(1)"
		var buf bytes.Buffer
		gw := gzip.NewWriter(&buf)
		tw := tar.NewWriter(gw)
		hdr := &tar.Header{
			Name: "blocked/main.py",
			Mode: 0o644,
			Size: int64(len(body)),
		}
		require.NoError(t, tw.WriteHeader(hdr))
		_, err := tw.Write([]byte(body))
		require.NoError(t, err)
		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())

		layerPath := filepath.Join(ociDir, "blobs", "sha256", "layer123")
		require.NoError(t, os.WriteFile(layerPath, buf.Bytes(), 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "python")
		assert.Error(t, err)
	})

	t.Run("failed to create file", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "algorithm.py", testPythonScript)

		// Create a directory where a file should be
		blockedFile := filepath.Join(destDir, "algorithm.py")
		require.NoError(t, os.MkdirAll(blockedFile, 0o755))

		_, _, err := ExtractAlgorithm(context.Background(), logger, ociDir, destDir, "python")
		assert.Error(t, err)
	})
}
// TestExtractDatasetErrorPathsInternal places a regular file named "blocked"
// in the destination directory and rewrites the layer so its only entry is
// "blocked/data.csv". ExtractDataset is expected to return an error.
func TestExtractDatasetErrorPathsInternal(t *testing.T) {
	t.Run("failed to create directory for dataset", func(t *testing.T) {
		ociDir, destDir := setupTestOCIImage(t, "data.csv", "a,b,c")

		blockedDir := filepath.Join(destDir, "blocked")
		require.NoError(t, os.WriteFile(blockedDir, []byte("data"), 0o644))

		// Every write/close is checked so the test cannot pass merely because
		// the fixture itself ended up malformed.
		const body = "a,b"
		var buf bytes.Buffer
		gw := gzip.NewWriter(&buf)
		tw := tar.NewWriter(gw)
		hdr := &tar.Header{
			Name: "blocked/data.csv",
			Mode: 0o644,
			Size: int64(len(body)),
		}
		require.NoError(t, tw.WriteHeader(hdr))
		_, err := tw.Write([]byte(body))
		require.NoError(t, err)
		require.NoError(t, tw.Close())
		require.NoError(t, gw.Close())

		layerPath := filepath.Join(ociDir, "blobs", "sha256", "layer123")
		require.NoError(t, os.WriteFile(layerPath, buf.Bytes(), 0o644))

		_, err = ExtractDataset(ociDir, destDir)
		assert.Error(t, err)
	})
}
// TestExtractAlgorithm_PythonNoRequirements extracts an image that only holds
// main.py and expects a non-empty algorithm path together with an empty
// requirements path.
func TestExtractAlgorithm_PythonNoRequirements(t *testing.T) {
	log := slog.Default()
	imageDir, outDir := setupTestOCIImage(t, "main.py", testPythonScript)

	algo, requirements, err := ExtractAlgorithm(context.Background(), log, imageDir, outDir, "python")

	require.NoError(t, err)
	assert.NotEmpty(t, algo)
	assert.Empty(t, requirements)
}
// TestExtractDataset_MultipleLayers builds an image whose manifest lists two
// layers (data1.csv in "l1", data2.csv in "l2") and expects ExtractDataset to
// report two files.
func TestExtractDataset_MultipleLayers(t *testing.T) {
	type layerRef struct {
		Digest string `json:"digest"`
	}
	type imageManifest struct {
		Layers []layerRef `json:"layers"`
	}
	type indexEntry = struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
		Size      int    `json:"size"`
	}

	ociDir := t.TempDir()
	destDir := t.TempDir()
	blobs := filepath.Join(ociDir, "blobs", "sha256")
	require.NoError(t, os.MkdirAll(blobs, 0o755))

	// writeLayer stores a gzipped tar with a single entry under blobs/<blobName>
	// and returns the digest string that refers to it.
	writeLayer := func(blobName, entryName, body string) string {
		out, err := os.Create(filepath.Join(blobs, blobName))
		require.NoError(t, err)

		gz := gzip.NewWriter(out)
		tarball := tar.NewWriter(gz)
		require.NoError(t, tarball.WriteHeader(&tar.Header{
			Name: entryName,
			Mode: 0o644,
			Size: int64(len(body)),
		}))
		_, err = tarball.Write([]byte(body))
		require.NoError(t, err)

		require.NoError(t, tarball.Close())
		require.NoError(t, gz.Close())
		require.NoError(t, out.Close())

		return "sha256:" + blobName
	}

	first := writeLayer("l1", "data1.csv", "1,2")
	second := writeLayer("l2", "data2.csv", "3,4")

	rawManifest, err := json.Marshal(imageManifest{
		Layers: []layerRef{{Digest: first}, {Digest: second}},
	})
	require.NoError(t, err)
	require.NoError(t, os.WriteFile(filepath.Join(blobs, "m1"), rawManifest, 0o644))

	rawIndex, err := json.Marshal(OCIIndex{
		SchemaVersion: 2,
		Manifests:     []indexEntry{{Digest: "sha256:m1", Size: len(rawManifest)}},
	})
	require.NoError(t, err)
	require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), rawIndex, 0o644))

	extracted, err := ExtractDataset(ociDir, destDir)
	require.NoError(t, err)
	assert.Len(t, extracted, 2)
}
// TestExtractAlgorithm_ErrorPaths expects ExtractAlgorithm with type "bin" to
// fail with "no algorithm file found" when the only layer blob is not gzip
// data.
func TestExtractAlgorithm_ErrorPaths(t *testing.T) {
	logger := slog.Default()

	t.Run("invalid layer gzip", func(t *testing.T) {
		ociDir := t.TempDir()
		blobsDir := filepath.Join(ociDir, "blobs", "sha256")
		require.NoError(t, os.MkdirAll(blobsDir, 0o755))
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "l1"), []byte("not gzip"), 0o644))

		manifest := struct {
			Layers []struct {
				Digest string `json:"digest"`
			} `json:"layers"`
		}{
			Layers: []struct {
				Digest string `json:"digest"`
			}{{Digest: "sha256:l1"}},
		}
		manifestData, err := json.Marshal(manifest)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(blobsDir, "m1"), manifestData, 0o644))

		index := OCIIndex{
			SchemaVersion: 2,
			Manifests: []struct {
				MediaType string `json:"mediaType"`
				Digest    string `json:"digest"`
				Size      int    `json:"size"`
			}{{Digest: "sha256:m1", Size: len(manifestData)}},
		}
		indexData, err := json.Marshal(index)
		require.NoError(t, err)
		require.NoError(t, os.WriteFile(filepath.Join(ociDir, "index.json"), indexData, 0o644))

		_, _, err = ExtractAlgorithm(context.Background(), logger, ociDir, t.TempDir(), "bin")
		// require (not assert) so a nil error fails the subtest here instead
		// of panicking on err.Error() below.
		require.Error(t, err)
		assert.Contains(t, err.Error(), "no algorithm file found")
	})
}