Merge pull request #3099 from MichaelEischer/check-less-memory

2020-11-16 10:11:25 +01:00 · 2020-11-16 10:11:25 +01:00 · 5f3b802ee7
parent 45ba456291 022dc35be9
commit 5f3b802ee7
6 changed files with 60 additions and 56 deletions
--- a/changelog/unreleased/pull-3099
+++ b/changelog/unreleased/pull-3099
@ -0,0 +1,6 @@
 Enhancement: Reduce memory usage of check command
 The check command now requires less memory if it is run with the
 `--check-unused` option.
 https://github.com/restic/restic/pull/3099
--- a/cmd/restic/cmd_check.go
+++ b/cmd/restic/cmd_check.go
@ -193,7 +193,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 		}
 	}
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, opts.CheckUnused)
 	Verbosef("load indexes\n")
 	hints, errs := chkr.LoadIndex(gopts.ctx)
@ -255,7 +255,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
 	}
 	if opts.CheckUnused {
-		for _, id := range chkr.UnusedBlobs() {
+		for _, id := range chkr.UnusedBlobs(gopts.ctx) {
 			Verbosef("unused blob %v\n", id)
 			errorsFound = true
 		}
--- a/internal/checker/checker.go
+++ b/internal/checker/checker.go
@ -25,31 +25,25 @@ type Checker struct {
 	packs    map[restic.ID]int64
 	blobRefs struct {
 		sync.Mutex
-		// see flags below
+		M restic.BlobSet
 		M map[restic.BlobHandle]blobStatus
 	}
 	trackUnused bool
 	masterIndex *repository.MasterIndex
 	repo restic.Repository
 }
 type blobStatus uint8
 const (
 	blobStatusExists blobStatus = 1 << iota
 	blobStatusReferenced
 )
 // New returns a new checker which runs on repo.
-func New(repo restic.Repository) *Checker {
+func New(repo restic.Repository, trackUnused bool) *Checker {
 	c := &Checker{
 		packs:       make(map[restic.ID]int64),
 		masterIndex: repository.NewMasterIndex(),
 		repo:        repo,
 		trackUnused: trackUnused,
 	}
-	c.blobRefs.M = make(map[restic.BlobHandle]blobStatus)
+	c.blobRefs.M = restic.NewBlobSet()
 	return c
 }
@ -162,8 +156,6 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
 			debug.Log("process blobs")
 			cnt := 0
 			for blob := range res.Index.Each(wgCtx) {
 				h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
 				c.blobRefs.M[h] = blobStatusExists
 				cnt++
 				if _, ok := packToIndex[blob.PackID]; !ok {
@ -529,9 +521,11 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha
 			// even when a file references a tree blob
 			c.blobRefs.Lock()
 			h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob}
-			status := c.blobRefs.M[h]
+			blobReferenced := c.blobRefs.M.Has(h)
 			// noop if already referenced
 			c.blobRefs.M.Insert(h)
 			c.blobRefs.Unlock()
-			if (status & blobStatusReferenced) != 0 {
+			if blobReferenced {
 				continue
 			}
@ -550,10 +544,6 @@ func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderCha
 		case loadCh <- nextTreeID:
 			outstandingLoadTreeJobs++
 			loadCh = nil
 			c.blobRefs.Lock()
 			h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob}
 			c.blobRefs.M[h] |= blobStatusReferenced
 			c.blobRefs.Unlock()
 		case j, ok := <-inCh:
 			if !ok {
@ -638,8 +628,6 @@ func (c *Checker) Structure(ctx context.Context, errChan chan<- error) {
 func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) {
 	debug.Log("checking tree %v", id)
 	var blobs []restic.ID
 	for _, node := range tree.Nodes {
 		switch node.Type {
 		case "file":
@ -653,13 +641,28 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) {
 					errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d has null ID", node.Name, b)})
 					continue
 				}
 				blobs = append(blobs, blobID)
 				blobSize, found := c.repo.LookupBlobSize(blobID, restic.DataBlob)
 				if !found {
-					errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %d size could not be found", node.Name, b)})
+					debug.Log("tree %v references blob %v which isn't contained in index", id, blobID)
 					errs = append(errs, Error{TreeID: id, Err: errors.Errorf("file %q blob %v not found in index", node.Name, blobID)})
 				}
 				size += uint64(blobSize)
 			}
 			if c.trackUnused {
 				// loop a second time to keep the locked section as short as possible
 				c.blobRefs.Lock()
 				for _, blobID := range node.Content {
 					if blobID.IsNull() {
 						continue
 					}
 					h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob}
 					c.blobRefs.M.Insert(h)
 					debug.Log("blob %v is referenced", blobID)
 				}
 				c.blobRefs.Unlock()
 			}
 		case "dir":
 			if node.Subtree == nil {
 				errs = append(errs, Error{TreeID: id, Err: errors.Errorf("dir node %q has no subtree", node.Name)})
@ -683,31 +686,26 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) {
 		}
 	}
 	for _, blobID := range blobs {
 		c.blobRefs.Lock()
 		h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob}
 		if (c.blobRefs.M[h] & blobStatusExists) == 0 {
 			debug.Log("tree %v references blob %v which isn't contained in index", id, blobID)
 			errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")})
 		}
 		c.blobRefs.M[h] |= blobStatusReferenced
 		debug.Log("blob %v is referenced", blobID)
 		c.blobRefs.Unlock()
 	}
 	return errs
 }
 // UnusedBlobs returns all blobs that have never been referenced.
-func (c *Checker) UnusedBlobs() (blobs restic.BlobHandles) {
+func (c *Checker) UnusedBlobs(ctx context.Context) (blobs restic.BlobHandles) {
 	if !c.trackUnused {
 		panic("only works when tracking blob references")
 	}
 	c.blobRefs.Lock()
 	defer c.blobRefs.Unlock()
 	debug.Log("checking %d blobs", len(c.blobRefs.M))
-	for id, flags := range c.blobRefs.M {
+	ctx, cancel := context.WithCancel(ctx)
-		if (flags & blobStatusReferenced) == 0 {
+	defer cancel()
-			debug.Log("blob %v not referenced", id)
+
-			blobs = append(blobs, id)
+	for blob := range c.repo.Index().Each(ctx) {
 		h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
 		if !c.blobRefs.M.Has(h) {
 			debug.Log("blob %v not referenced", h)
 			blobs = append(blobs, h)
 		}
 	}
--- a/internal/checker/checker_test.go
+++ b/internal/checker/checker_test.go
@ -61,7 +61,7 @@ func TestCheckRepo(t *testing.T) {
 	repo := repository.TestOpenLocal(t, repodir)
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@ -87,7 +87,7 @@ func TestMissingPack(t *testing.T) {
 	}
 	test.OK(t, repo.Backend().Remove(context.TODO(), packHandle))
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@ -123,7 +123,7 @@ func TestUnreferencedPack(t *testing.T) {
 	}
 	test.OK(t, repo.Backend().Remove(context.TODO(), indexHandle))
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@ -168,7 +168,7 @@ func TestUnreferencedBlobs(t *testing.T) {
 	sort.Sort(unusedBlobsBySnapshot)
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, true)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@ -181,7 +181,7 @@ func TestUnreferencedBlobs(t *testing.T) {
 	test.OKs(t, checkPacks(chkr))
 	test.OKs(t, checkStruct(chkr))
-	blobs := chkr.UnusedBlobs()
+	blobs := chkr.UnusedBlobs(context.TODO())
 	sort.Sort(blobs)
 	test.Equals(t, unusedBlobsBySnapshot, blobs)
@ -241,7 +241,7 @@ func TestModifiedIndex(t *testing.T) {
 		t.Fatal(err)
 	}
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) == 0 {
 		t.Fatalf("expected errors not found")
@ -264,7 +264,7 @@ func TestDuplicatePacksInIndex(t *testing.T) {
 	repo := repository.TestOpenLocal(t, repodir)
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(hints) == 0 {
 		t.Fatalf("did not get expected checker hints for duplicate packs in indexes")
@ -336,7 +336,7 @@ func TestCheckerModifiedData(t *testing.T) {
 	checkRepo := repository.New(beError)
 	test.OK(t, checkRepo.SearchKey(context.TODO(), test.TestPassword, 5, ""))
-	chkr := checker.New(checkRepo)
+	chkr := checker.New(checkRepo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
@ -398,7 +398,7 @@ func TestCheckerNoDuplicateTreeDecodes(t *testing.T) {
 		loadedTrees: restic.NewIDSet(),
 	}
-	chkr := checker.New(checkRepo)
+	chkr := checker.New(checkRepo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@ -509,7 +509,7 @@ func TestCheckerBlobTypeConfusion(t *testing.T) {
 		UnblockChannel: make(chan struct{}),
 	}
-	chkr := checker.New(delayRepo)
+	chkr := checker.New(delayRepo, false)
 	go func() {
 		<-ctx.Done()
@ -544,7 +544,7 @@ func loadBenchRepository(t *testing.B) (*checker.Checker, restic.Repository, fun
 	repo := repository.TestOpenLocal(t, repodir)
-	chkr := checker.New(repo)
+	chkr := checker.New(repo, false)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) > 0 {
 		defer cleanup()
--- a/internal/checker/testing.go
+++ b/internal/checker/testing.go
@ -9,7 +9,7 @@ import (
 // TestCheckRepo runs the checker on repo.
 func TestCheckRepo(t testing.TB, repo restic.Repository) {
-	chkr := New(repo)
+	chkr := New(repo, true)
 	hints, errs := chkr.LoadIndex(context.TODO())
 	if len(errs) != 0 {
@ -37,7 +37,7 @@ func TestCheckRepo(t testing.TB, repo restic.Repository) {
 	}
 	// unused blobs
-	blobs := chkr.UnusedBlobs()
+	blobs := chkr.UnusedBlobs(context.TODO())
 	if len(blobs) > 0 {
 		t.Errorf("unused blobs found: %v", blobs)
 	}
--- a/internal/repository/master_index_test.go
+++ b/internal/repository/master_index_test.go
@ -360,7 +360,7 @@ func TestIndexSave(t *testing.T) {
 		}
 	}
-	checker := checker.New(repo)
+	checker := checker.New(repo, false)
 	hints, errs := checker.LoadIndex(context.TODO())
 	for _, h := range hints {
 		t.Logf("hint: %v\n", h)