From b8c7543a551697e6b9534ced5f6ed0f375ad22de Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Fri, 6 Nov 2020 23:32:31 +0100 Subject: [PATCH 1/5] check: Merge 'size could not be found' and 'not found in index' errors By construction these two errors always show up in pairs: 'size could not be found' is printed when the blob is not found in the repository index. That blob is also part of the `blobs` array. Later on, check iterates over that array and checks whether the blob is marked as existing. Which cannot be the case as that mark is generated by iterating over the repository index. The merged warning no longer reports the blob index within a file. That information could also be derived by printing the affected tree using `cat` and searching for the blob. --- internal/checker/checker.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 05c131055..6466693c4 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -656,7 +656,8 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { blobs = append(blobs, blobID) blobSize, found := c.repo.LookupBlobSize(blobID, restic.DataBlob) if !found { - errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d size could not be found", node.Name, b)}) + debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) + errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %v not found in index", node.Name, blobID)}) } size += uint64(blobSize) } @@ -686,10 +687,6 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { for _, blobID := range blobs { c.blobRefs.Lock() h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} - if (c.blobRefs.M[h] & blobStatusExists) == 0 { - debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) - errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")}) - } c.blobRefs.M[h] |= blobStatusReferenced debug.Log("blob %v is referenced", blobID) c.blobRefs.Unlock() From 3500f9490c976af6dde00f40d449747f6d9b69df Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Fri, 6 Nov 2020 23:41:04 +0100 Subject: [PATCH 2/5] check: Simplify blob status tracking UnusedBlobs now directly reads the list of existing blobs from the repository index. This removes the need for the blobStatusExists flag, which in turn allows converting the blobRefs map into a BlobSet. --- cmd/restic/cmd_check.go | 2 +- internal/checker/checker.go | 36 +++++++++++++------------------- internal/checker/checker_test.go | 2 +- internal/checker/testing.go | 2 +- 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/cmd/restic/cmd_check.go b/cmd/restic/cmd_check.go index 529b2d71d..84eaa5ac4 100644 --- a/cmd/restic/cmd_check.go +++ b/cmd/restic/cmd_check.go @@ -255,7 +255,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { } if opts.CheckUnused { - for _, id := range chkr.UnusedBlobs() { + for _, id := range chkr.UnusedBlobs(gopts.ctx) { Verbosef("unused blob %v\n", id) errorsFound = true } diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 6466693c4..09a71f9de 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -25,8 +25,7 @@ type Checker struct { packs map[restic.ID]int64 blobRefs struct { sync.Mutex - // see flags below - M map[restic.BlobHandle]blobStatus + M restic.BlobSet } masterIndex *repository.MasterIndex @@ -34,13 +33,6 @@ type Checker struct { repo restic.Repository } -type blobStatus uint8 - -const ( - blobStatusExists blobStatus = 1 << iota - blobStatusReferenced -) - // New returns a new checker which runs on repo. func New(repo restic.Repository) *Checker { c := &Checker{ @@ -49,7 +41,7 @@ func New(repo restic.Repository) *Checker { repo: repo, } - c.blobRefs.M = make(map[restic.BlobHandle]blobStatus) + c.blobRefs.M = restic.NewBlobSet() return c } @@ -162,8 +154,6 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) { debug.Log("process blobs") cnt := 0 for blob := range res.Index.Each(wgCtx) { - h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - c.blobRefs.M[h] = blobStatusExists cnt++ if _, ok := packToIndex[blob.PackID]; !ok { @@ -529,9 +519,9 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha // even when a file references a tree blob c.blobRefs.Lock() h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} - status := c.blobRefs.M[h] + blobReferenced := c.blobRefs.M.Has(h) c.blobRefs.Unlock() - if (status & blobStatusReferenced) != 0 { + if blobReferenced { continue } @@ -552,7 +542,7 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha loadCh = nil c.blobRefs.Lock() h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} - c.blobRefs.M[h] |= blobStatusReferenced + c.blobRefs.M.Insert(h) c.blobRefs.Unlock() case j, ok := <-inCh: @@ -687,7 +677,7 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { for _, blobID := range blobs { c.blobRefs.Lock() h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} - c.blobRefs.M[h] |= blobStatusReferenced + c.blobRefs.M.Insert(h) debug.Log("blob %v is referenced", blobID) c.blobRefs.Unlock() } @@ -696,15 +686,19 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { } // UnusedBlobs returns all blobs that have never been referenced. -func (c *Checker) UnusedBlobs() (blobs restic.BlobHandles) { +func (c *Checker) UnusedBlobs(ctx context.Context) (blobs restic.BlobHandles) { c.blobRefs.Lock() defer c.blobRefs.Unlock() debug.Log("checking %d blobs", len(c.blobRefs.M)) - for id, flags := range c.blobRefs.M { - if (flags & blobStatusReferenced) == 0 { - debug.Log("blob %v not referenced", id) - blobs = append(blobs, id) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + for blob := range c.repo.Index().Each(ctx) { + h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} + if !c.blobRefs.M.Has(h) { + debug.Log("blob %v not referenced", h) + blobs = append(blobs, h) } } diff --git a/internal/checker/checker_test.go b/internal/checker/checker_test.go index 8218a94fc..333e5573b 100644 --- a/internal/checker/checker_test.go +++ b/internal/checker/checker_test.go @@ -181,7 +181,7 @@ func TestUnreferencedBlobs(t *testing.T) { test.OKs(t, checkPacks(chkr)) test.OKs(t, checkStruct(chkr)) - blobs := chkr.UnusedBlobs() + blobs := chkr.UnusedBlobs(context.TODO()) sort.Sort(blobs) test.Equals(t, unusedBlobsBySnapshot, blobs) diff --git a/internal/checker/testing.go b/internal/checker/testing.go index 8e49b562c..c6d9a08ef 100644 --- a/internal/checker/testing.go +++ b/internal/checker/testing.go @@ -37,7 +37,7 @@ func TestCheckRepo(t testing.TB, repo restic.Repository) { } // unused blobs - blobs := chkr.UnusedBlobs() + blobs := chkr.UnusedBlobs(context.TODO()) if len(blobs) > 0 { t.Errorf("unused blobs found: %v", blobs) } From 6da66c15d88771c96a8f997308fc2dbc66165a15 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Fri, 6 Nov 2020 23:54:20 +0100 Subject: [PATCH 3/5] check: Simplify referenced blob tracking The result is identical as long as the context in not canceled. However, in that case the result is incomplete anyways. --- internal/checker/checker.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 09a71f9de..19501b4e6 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -520,6 +520,8 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha c.blobRefs.Lock() h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} blobReferenced := c.blobRefs.M.Has(h) + // noop if already referenced + c.blobRefs.M.Insert(h) c.blobRefs.Unlock() if blobReferenced { continue @@ -540,10 +542,6 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha case loadCh <- nextTreeID: outstandingLoadTreeJobs++ loadCh = nil - c.blobRefs.Lock() - h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob} - c.blobRefs.M.Insert(h) - c.blobRefs.Unlock() case j, ok := <-inCh: if !ok { From 1f43cac12d3b3184a41535b291cefb868ccc63e1 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 7 Nov 2020 00:07:32 +0100 Subject: [PATCH 4/5] check: Only track data blobs when unused blobs should be reported This improves the memory usage of check a lot as it now only has to track tree blobs when run using the default parameters. --- cmd/restic/cmd_check.go | 2 +- internal/checker/checker.go | 33 +++++++++++++++--------- internal/checker/checker_test.go | 20 +++++++------- internal/checker/testing.go | 2 +- internal/repository/master_index_test.go | 2 +- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/cmd/restic/cmd_check.go b/cmd/restic/cmd_check.go index 84eaa5ac4..774879490 100644 --- a/cmd/restic/cmd_check.go +++ b/cmd/restic/cmd_check.go @@ -193,7 +193,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { } } - chkr := checker.New(repo) + chkr := checker.New(repo, opts.CheckUnused) Verbosef("load indexes\n") hints, errs := chkr.LoadIndex(gopts.ctx) diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 19501b4e6..1e31ba986 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -27,6 +27,7 @@ type Checker struct { sync.Mutex M restic.BlobSet } + trackUnused bool masterIndex *repository.MasterIndex @@ -34,11 +35,12 @@ type Checker struct { } // New returns a new checker which runs on repo. -func New(repo restic.Repository) *Checker { +func New(repo restic.Repository, trackUnused bool) *Checker { c := &Checker{ packs: make(map[restic.ID]int64), masterIndex: repository.NewMasterIndex(), repo: repo, + trackUnused: trackUnused, } c.blobRefs.M = restic.NewBlobSet() @@ -626,8 +628,6 @@ func (c *Checker) Structure(ctx context.Context, errChan chan<- error) { func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { debug.Log("checking tree %v", id) - var blobs []restic.ID - for _, node := range tree.Nodes { switch node.Type { case "file": @@ -641,7 +641,6 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d has null ID", node.Name, b)}) continue } - blobs = append(blobs, blobID) blobSize, found := c.repo.LookupBlobSize(blobID, restic.DataBlob) if !found { debug.Log("tree %v references blob %v which isn't contained in index", id, blobID) @@ -649,6 +648,21 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { } size += uint64(blobSize) } + + if c.trackUnused { + // loop a second time to keep the locked section as short as possible + c.blobRefs.Lock() + for _, blobID := range node.Content { + if blobID.IsNull() { + continue + } + h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} + c.blobRefs.M.Insert(h) + debug.Log("blob %v is referenced", blobID) + } + c.blobRefs.Unlock() + } + case "dir": if node.Subtree == nil { errs = append(errs, Error{TreeID: id, Err: errors.Errorf("dir node %q has no subtree", node.Name)}) @@ -672,19 +686,14 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) { } } - for _, blobID := range blobs { - c.blobRefs.Lock() - h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob} - c.blobRefs.M.Insert(h) - debug.Log("blob %v is referenced", blobID) - c.blobRefs.Unlock() - } - return errs } // UnusedBlobs returns all blobs that have never been referenced. func (c *Checker) UnusedBlobs(ctx context.Context) (blobs restic.BlobHandles) { + if !c.trackUnused { + panic("only works when tracking blob references") + } c.blobRefs.Lock() defer c.blobRefs.Unlock() diff --git a/internal/checker/checker_test.go b/internal/checker/checker_test.go index 333e5573b..f8efd05e8 100644 --- a/internal/checker/checker_test.go +++ b/internal/checker/checker_test.go @@ -61,7 +61,7 @@ func TestCheckRepo(t *testing.T) { repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -87,7 +87,7 @@ func TestMissingPack(t *testing.T) { } test.OK(t, repo.Backend().Remove(context.TODO(), packHandle)) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -123,7 +123,7 @@ func TestUnreferencedPack(t *testing.T) { } test.OK(t, repo.Backend().Remove(context.TODO(), indexHandle)) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -168,7 +168,7 @@ func TestUnreferencedBlobs(t *testing.T) { sort.Sort(unusedBlobsBySnapshot) - chkr := checker.New(repo) + chkr := checker.New(repo, true) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -241,7 +241,7 @@ func TestModifiedIndex(t *testing.T) { t.Fatal(err) } - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) == 0 { t.Fatalf("expected errors not found") @@ -264,7 +264,7 @@ func TestDuplicatePacksInIndex(t *testing.T) { repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(hints) == 0 { t.Fatalf("did not get expected checker hints for duplicate packs in indexes") @@ -336,7 +336,7 @@ func TestCheckerModifiedData(t *testing.T) { checkRepo := repository.New(beError) test.OK(t, checkRepo.SearchKey(context.TODO(), test.TestPassword, 5, "")) - chkr := checker.New(checkRepo) + chkr := checker.New(checkRepo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { @@ -398,7 +398,7 @@ func TestCheckerNoDuplicateTreeDecodes(t *testing.T) { loadedTrees: restic.NewIDSet(), } - chkr := checker.New(checkRepo) + chkr := checker.New(checkRepo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { t.Fatalf("expected no errors, got %v: %v", len(errs), errs) @@ -509,7 +509,7 @@ func TestCheckerBlobTypeConfusion(t *testing.T) { UnblockChannel: make(chan struct{}), } - chkr := checker.New(delayRepo) + chkr := checker.New(delayRepo, false) go func() { <-ctx.Done() @@ -544,7 +544,7 @@ func loadBenchRepository(t *testing.B) (*checker.Checker, restic.Repository, fun repo := repository.TestOpenLocal(t, repodir) - chkr := checker.New(repo) + chkr := checker.New(repo, false) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) > 0 { defer cleanup() diff --git a/internal/checker/testing.go b/internal/checker/testing.go index c6d9a08ef..6c5be84e2 100644 --- a/internal/checker/testing.go +++ b/internal/checker/testing.go @@ -9,7 +9,7 @@ import ( // TestCheckRepo runs the checker on repo. func TestCheckRepo(t testing.TB, repo restic.Repository) { - chkr := New(repo) + chkr := New(repo, true) hints, errs := chkr.LoadIndex(context.TODO()) if len(errs) != 0 { diff --git a/internal/repository/master_index_test.go b/internal/repository/master_index_test.go index 729f088ac..0d56cd097 100644 --- a/internal/repository/master_index_test.go +++ b/internal/repository/master_index_test.go @@ -360,7 +360,7 @@ func TestIndexSave(t *testing.T) { } } - checker := checker.New(repo) + checker := checker.New(repo, false) hints, errs := checker.LoadIndex(context.TODO()) for _, h := range hints { t.Logf("hint: %v\n", h) From 022dc35be968d79949e62942c71dad777d8c3c05 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 15 Nov 2020 19:02:51 +0100 Subject: [PATCH 5/5] Add changelog --- changelog/unreleased/pull-3099 | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog/unreleased/pull-3099 diff --git a/changelog/unreleased/pull-3099 b/changelog/unreleased/pull-3099 new file mode 100644 index 000000000..3eb09e38d --- /dev/null +++ b/changelog/unreleased/pull-3099 @@ -0,0 +1,6 @@ +Enhancement: Reduce memory usage of check command + +The check command now requires less memory if it is run with the +`--check-unused` option. + +https://github.com/restic/restic/pull/3099