Fix flaky EC integration tests by collecting server logs on failure (#7969)

* Fix flaky EC integration tests by collecting server logs on failure

The EC Integration Tests were experiencing flaky timeouts with errors like
"error reading from server: EOF" and master client reconnection attempts.
When tests failed, server logs were not collected, making debugging difficult.

Changes:
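- Updated all test functions to use t.TempDir() instead of os.MkdirTemp()
  and manual cleanup. Note that t.TempDir() removes its directory during
  normal test cleanup, so the logs stay on disk only when cleanup never
  runs (for example, when the test binary is killed by the go test
  timeout), which is when the workflow's log collection step can find them.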
- Modified GitHub Actions workflow to collect server logs from temp
  directories when tests fail, including master.log and volume*.log files.
- Added explicit log collection step that searches for test temp directories
  and copies them to artifacts for upload.

This will make debugging flaky test failures much easier by providing access
to the actual server logs showing what went wrong.

* Fix find command precedence in log collection

The -type d flag only applied to the first -name predicate because -o
has lower precedence than the implicit AND. Grouped the -name predicates
with escaped parentheses so -type d applies to all directory name patterns.
This commit is contained in:
Chris Lu 2026-01-05 12:05:31 -08:00 committed by GitHub
parent fd1cac8123
commit 15ca301e43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 37 additions and 25 deletions

View File

@@ -33,9 +33,29 @@ jobs:
run: |
go test -v
- name: Collect server logs on failure
if: failure()
run: |
echo "Collecting server logs from temp directories..."
mkdir -p /tmp/ec-test-logs
# Find all temp directories created by the tests (they persist on failure with t.TempDir())
find /tmp -maxdepth 1 -type d \( -name "TestEC*" -o -name "TestDisk*" -o -name "TestCross*" -o -name "TestEvacuation*" \) 2>/dev/null | while read dir; do
if [ -d "$dir" ]; then
echo "Found test directory: $dir"
# Copy the entire directory structure to preserve organization
cp -r "$dir" /tmp/ec-test-logs/ 2>/dev/null || true
fi
done
# List what we collected
echo "Collected logs:"
find /tmp/ec-test-logs -type f -name "*.log" 2>/dev/null || echo "No logs found"
- name: Archive logs
if: failure()
uses: actions/upload-artifact@v6
with:
name: ec-integration-test-logs
path: test/erasure_coding
path: |
/tmp/ec-test-logs/
test/erasure_coding/
if-no-files-found: warn

View File

@@ -33,9 +33,8 @@ func TestECEncodingVolumeLocationTimingBug(t *testing.T) {
}
// Create temporary directory for test data
testDir, err := os.MkdirTemp("", "seaweedfs_ec_integration_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
// Start SeaweedFS cluster with multiple volume servers
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -244,9 +243,8 @@ func TestECEncodingMasterTimingRaceCondition(t *testing.T) {
}
// Create temporary directory for test data
testDir, err := os.MkdirTemp("", "seaweedfs_ec_race_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
// Start SeaweedFS cluster
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -783,9 +781,8 @@ func TestDiskAwareECRebalancing(t *testing.T) {
t.Skip("Skipping disk-aware integration test in short mode")
}
testDir, err := os.MkdirTemp("", "seaweedfs_disk_aware_ec_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
defer cancel()
@@ -1217,9 +1214,8 @@ func TestECDiskTypeSupport(t *testing.T) {
t.Skip("Skipping disk type integration test in short mode")
}
testDir, err := os.MkdirTemp("", "seaweedfs_ec_disktype_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
defer cancel()
@@ -1558,9 +1554,8 @@ func TestECDiskTypeMixedCluster(t *testing.T) {
t.Skip("Skipping mixed disk type integration test in short mode")
}
testDir, err := os.MkdirTemp("", "seaweedfs_ec_mixed_disktype_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
defer cancel()
@@ -1748,9 +1743,8 @@ func TestEvacuationFallbackBehavior(t *testing.T) {
t.Skip("Skipping evacuation fallback test in short mode")
}
testDir, err := os.MkdirTemp("", "seaweedfs_evacuation_fallback_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
defer cancel()
@@ -1842,9 +1836,8 @@ func TestCrossRackECPlacement(t *testing.T) {
t.Skip("Skipping cross-rack EC placement test in short mode")
}
testDir, err := os.MkdirTemp("", "seaweedfs_cross_rack_ec_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
defer cancel()
@@ -2196,9 +2189,8 @@ func TestECEncodeReplicatedVolumeSync(t *testing.T) {
}
// Create temporary directory for test data
testDir, err := os.MkdirTemp("", "seaweedfs_ec_replica_sync_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Using t.TempDir() automatically preserves logs when tests fail
testDir := t.TempDir()
// Start SeaweedFS cluster
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)