diff --git a/changelog/unreleased/pull-3099 b/changelog/unreleased/pull-3099 new file mode 100644 index 000000000..3eb09e38d --- /dev/null +++ b/changelog/unreleased/pull-3099 @@ -0,0 +1,6 @@ +Enhancement: Reduce memory usage of check command + +The check command now requires less memory if it is run with the +`--check-unused` option. + +https://github.com/restic/restic/pull/3099 diff --git a/cmd/restic/cmd_check.go b/cmd/restic/cmd_check.go index 529b2d71d..774879490 100644 --- a/cmd/restic/cmd_check.go +++ b/cmd/restic/cmd_check.go @@ -193,7 +193,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { } } - chkr := checker.New(repo) + chkr := checker.New(repo, opts.CheckUnused) Verbosef("load indexes\n") hints, errs := chkr.LoadIndex(gopts.ctx) @@ -255,7 +255,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { } if opts.CheckUnused { - for _, id := range chkr.UnusedBlobs() { + for _, id := range chkr.UnusedBlobs(gopts.ctx) { Verbosef("unused blob %v\n", id) errorsFound = true } diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 05c131055..1e31ba986 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -25,31 +25,25 @@ type Checker struct { packs map[restic.ID]int64 blobRefs struct { sync.Mutex - // see flags below - M map[restic.BlobHandle]blobStatus + M restic.BlobSet } + trackUnused bool masterIndex *repository.MasterIndex repo restic.Repository } -type blobStatus uint8 - -const ( - blobStatusExists blobStatus = 1 << iota - blobStatusReferenced -) - // New returns a new checker which runs on repo. -func New(repo restic.Repository) *Checker { +func New(repo restic.Repository, trackUnused bool) *Checker { c := &Checker{ packs: make(map[restic.ID]int64), masterIndex: repository.NewMasterIndex(), repo: repo, + trackUnused: trackUnused, } - c.blobRefs.M = make(map[restic.BlobHandle]blobStatus) + c.blobRefs.M = restic.NewBlobSet() return c } @@ -162,8 +156,6 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) { debug.Log("process blobs") cnt := 0 for blob := range res.Index.Each(wgCtx) { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - c.blobRefs.M[h] = blobStatusExists cnt++ if _, ok := packToIndex[blob.PackID]; !ok { @@ -529,9 +521,11 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha // even when a file references a tree blob c.blobRefs.Lock() h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} - status := c.blobRefs.M[h] + blobReferenced := c.blobRefs.M.Has(h) + // noop if already referenced + c.blobRefs.M.Insert(h) c.blobRefs.Unlock() - if (status & blobStatusReferenced) != 0 { + if blobReferenced { continue } @@ -550,10 +544,6 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha case loadCh <- nextTreeID: outstandingLoadTreeJobs++ loadCh = nil - c.blobRefs.Lock() - h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} - c.blobRefs.M[h] |= blobStatusReferenced - c.blobRefs.Unlock() case j, ok := <-inCh: if !ok { @@ -638,8 +628,6 @@ func (c *Checker) Structure(ctx context.Context, errChan chan<- error) { func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { debug.Log("checking tree %v", id) - var blobs []restic.ID - for _, node := range tree.Nodes { switch node.Type { case "file": @@ -653,13 +641,28 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d has null ID", node.Name, b)}) continue } - blobs = append(blobs, blobID) blobSize, found := c.repo.LookupBlobSize(blobID, restic.DataBlob) if !found { - errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d size could not be found", node.Name, b)}) + debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) + errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %v not found in index", node.Name, blobID)}) } size += uint64(blobSize) } + + if c.trackUnused { + // loop a second time to keep the locked section as short as possible + c.blobRefs.Lock() + for _, blobID := range node.Content { + if blobID.IsNull() { + continue + } + h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} + c.blobRefs.M.Insert(h) + debug.Log("blob %v is referenced", blobID) + } + c.blobRefs.Unlock() + } + case "dir": if node.Subtree == nil { errs = append(errs, Error{TreeID: id, Err: errors.Errorf("dir node %q has no subtree", node.Name)}) @@ -683,31 +686,26 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { } } - for _, blobID := range blobs { - c.blobRefs.Lock() - h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} - if (c.blobRefs.M[h] & blobStatusExists) == 0 { - debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) - errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")}) - } - c.blobRefs.M[h] |= blobStatusReferenced - debug.Log("blob %v is referenced", blobID) - c.blobRefs.Unlock() - } - return errs } // UnusedBlobs returns all blobs that have never been referenced. -func (c *Checker) UnusedBlobs() (blobs restic.BlobHandles) { +func (c *Checker) UnusedBlobs(ctx context.Context) (blobs restic.BlobHandles) { + if !c.trackUnused { + panic("only works when tracking blob references") + } c.blobRefs.Lock() defer c.blobRefs.Unlock() debug.Log("checking %d blobs", len(c.blobRefs.M)) - for id, flags := range c.blobRefs.M { - if (flags & blobStatusReferenced) == 0 { - debug.Log("blob %v not referenced", id) - blobs = append(blobs, id) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + for blob := range c.repo.Index().Each(ctx) { + h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} + if !c.blobRefs.M.Has(h) { + debug.Log("blob %v not referenced", h) + blobs = append(blobs, h) } } diff --git a/internal/checker/checker_test.go b/internal/checker/checker_test.go index 8218a94fc..f8efd05e8 100644 --- a/internal/checker/checker_test.go +++ b/internal/checker/checker_test.go @@ -61,7 +61,7 @@ func TestCheckRepo(t *testing.T) { repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -87,7 +87,7 @@ func TestMissingPack(t *testing.T) { } test.OK(t, repo.Backend().Remove(context.TODO(), packHandle)) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -123,7 +123,7 @@ func TestUnreferencedPack(t *testing.T) { } test.OK(t, repo.Backend().Remove(context.TODO(), indexHandle)) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -168,7 +168,7 @@ func TestUnreferencedBlobs(t *testing.T) { sort.Sort(unusedBlobsBySnapshot) - chkr := checker.New(repo) + chkr := checker.New(repo, true) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -181,7 +181,7 @@ func TestUnreferencedBlobs(t *testing.T) { test.OKs(t, checkPacks(chkr)) test.OKs(t, checkStruct(chkr)) - blobs := chkr.UnusedBlobs() + blobs := chkr.UnusedBlobs(context.TODO()) sort.Sort(blobs) test.Equals(t, unusedBlobsBySnapshot, blobs) @@ -241,7 +241,7 @@ func TestModifiedIndex(t *testing.T) { t.Fatal(err) } - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) == 0 { t.Fatalf("expected errors not found") @@ -264,7 +264,7 @@ func TestDuplicatePacksInIndex(t *testing.T) { repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(hints) == 0 { t.Fatalf("did not get expected checker hints for duplicate packs in indexes") @@ -336,7 +336,7 @@ func TestCheckerModifiedData(t *testing.T) { checkRepo := repository.New(beError) test.OK(t, checkRepo.SearchKey(context.TODO(), test.TestPassword, 5, "")) - chkr := checker.New(checkRepo) + chkr := checker.New(checkRepo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { @@ -398,7 +398,7 @@ func TestCheckerNoDuplicateTreeDecodes(t *testing.T) { loadedTrees: restic.NewIDSet(), } - chkr := checker.New(checkRepo) + chkr := checker.New(checkRepo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -509,7 +509,7 @@ func TestCheckerBlobTypeConfusion(t *testing.T) { UnblockChannel: make(chan struct{}), } - chkr := checker.New(delayRepo) + chkr := checker.New(delayRepo, false) go func() { <-ctx.Done() @@ -544,7 +544,7 @@ func loadBenchRepository(t *testing.B) (*checker.Checker, restic.Repository, fun repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { defer cleanup() diff --git a/internal/checker/testing.go b/internal/checker/testing.go index 8e49b562c..6c5be84e2 100644 --- a/internal/checker/testing.go +++ b/internal/checker/testing.go @@ -9,7 +9,7 @@ import ( // TestCheckRepo runs the checker on repo. func TestCheckRepo(t testing.TB, repo restic.Repository) { - chkr := New(repo) + chkr := New(repo, true) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) != 0 { @@ -37,7 +37,7 @@ func TestCheckRepo(t testing.TB, repo restic.Repository) { } // unused blobs - blobs := chkr.UnusedBlobs() + blobs := chkr.UnusedBlobs(context.TODO()) if len(blobs) > 0 { t.Errorf("unused blobs found: %v", blobs) } diff --git a/internal/repository/master_index_test.go b/internal/repository/master_index_test.go index 729f088ac..0d56cd097 100644 --- a/internal/repository/master_index_test.go +++ b/internal/repository/master_index_test.go @@ -360,7 +360,7 @@ func TestIndexSave(t *testing.T) { } } - checker := checker.New(repo) + checker := checker.New(repo, false) hints, errs := checker.LoadIndex(context.TODO()) for _, h := range hints { t.Logf("hint: %v\n", h)