From c1a3de4a6e7e373f80e65f218c109b5a09cab8a5 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Tue, 3 Nov 2020 10:53:38 +0100 Subject: [PATCH] Refactor max-unused calculation, add `unlimited` option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a callback to the PruneOptions struct which calculates the number of bytes allowed to be unused after prune is done. This way, the logic is closer to the option parsing code. Also, add an explicit option `unlimited` for the use case when storage does not matter but bandwidth and time do. Internally, this sets the maximum number of unused bytes to MaxUint64. Rework the documentation slightly so that no more "packs" are mentioned and it talks about "files" instead. Make it clear in the documentation that the percentage given to `--max-unused` is relative to the whole repository size after pruning is done. If specified, it must be below 100%, otherwise the repository would contain 100% of unused data, which is pointless. I had a hard time coming up with the correct formula to calculate the maximum number of unused bytes based on the number of used bytes. 
For a fraction `p` (0 ≤ p < 1), a repo with `u` bytes used, and the number of unused bytes `x` the following holds: x ≤ p * (u+x) ⇔ x ≤ p*u + p*x ⇔ x - p*x ≤ p*u ⇔ x * (1-p) ≤ p*u ⇔ x ≤ p/(1-p) * u --- cmd/restic/cmd_prune.go | 73 ++++++++++++++++++++-------------- cmd/restic/integration_test.go | 12 +++--- doc/060_forget.rst | 73 ++++++++++++++++++++-------------- 3 files changed, 94 insertions(+), 64 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 4f83a9874..0ff4600b8 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -1,8 +1,10 @@ package main import ( + "math" "sort" "strconv" + "strings" "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" @@ -39,9 +41,8 @@ Exit status is 0 if the command was successful, and non-zero if there was any er type PruneOptions struct { DryRun bool - MaxUnused string - MaxUnusedPercent float64 // set if MaxUnused is a percentage - MaxUnusedBytes uint64 // set if MaxUnused is an absolute number of bytes + MaxUnused string + maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused MaxRepackSize string MaxRepackBytes uint64 @@ -60,7 +61,7 @@ func init() { func addPruneOptions(c *cobra.Command) { f := c.Flags() - f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)") + f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')") f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)") f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") } @@ -74,27 +75,46 @@ func verifyPruneOptions(opts *PruneOptions) error { 
opts.MaxRepackBytes = uint64(size) } - length := len(opts.MaxUnused) - if length == 0 { - return nil + maxUnused := strings.TrimSpace(opts.MaxUnused) + if maxUnused == "" { + return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused) } - var err error - if opts.MaxUnused[length-1] == '%' { - opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64) - opts.MaxUnusedBytes = ^uint64(0) - } else { - var size int64 - size, err = parseSizeStr(opts.MaxUnused) - opts.MaxUnusedPercent = 100.0 - opts.MaxUnusedBytes = uint64(size) - } - if err != nil { - return err - } + // parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes + switch { + case maxUnused == "unlimited": + opts.maxUnusedBytes = func(used uint64) uint64 { + return math.MaxUint64 + } - if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 { - return errors.Fatalf("--max-unused-percent should be between 0 and 100. Given value: %f", opts.MaxUnusedPercent) + case strings.HasSuffix(maxUnused, "%"): + maxUnused = strings.TrimSuffix(maxUnused, "%") + p, err := strconv.ParseFloat(maxUnused, 64) + if err != nil { + return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err) + } + + if p < 0 { + return errors.Fatal("percentage for --max-unused must be positive") + } + + if p >= 100 { + return errors.Fatal("percentage for --max-unused must be below 100%") + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(p / (100 - p) * float64(used)) + } + + default: + size, err := parseSizeStr(maxUnused) + if err != nil { + return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err) + } + + opts.maxUnusedBytes = func(used uint64) uint64 { + return uint64(size) + } } return nil @@ -344,13 +364,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB repackAllPacksWithDuplicates := true - maxUnusedSizeAfter := opts.MaxUnusedBytes - if 
opts.MaxUnusedPercent < 100.0 { - maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used)) - if maxUnusedSizePercent < maxUnusedSizeAfter { - maxUnusedSizeAfter = maxUnusedSizePercent - } - } + // calculate limit for number of unused bytes in the repo after repacking + maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used) // Sort repackCandidates such that packs with highest ratio unused/used space are picked first. // This is equivalent to sorting by unused / total space. diff --git a/cmd/restic/integration_test.go b/cmd/restic/integration_test.go index 6aeeab634..789240e0c 100644 --- a/cmd/restic/integration_test.go +++ b/cmd/restic/integration_test.go @@ -1387,25 +1387,25 @@ func TestCheckRestoreNoLock(t *testing.T) { func TestPrune(t *testing.T) { t.Run("0", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 0.0} + opts := PruneOptions{MaxUnused: "0%"} checkOpts := CheckOptions{ReadData: true, CheckUnused: true} testPrune(t, opts, checkOpts) }) t.Run("50", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 50.0} + opts := PruneOptions{MaxUnused: "50%"} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) - t.Run("100", func(t *testing.T) { - opts := PruneOptions{MaxUnusedPercent: 100.0} + t.Run("unlimited", func(t *testing.T) { + opts := PruneOptions{MaxUnused: "unlimited"} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) t.Run("CachableOnly", func(t *testing.T) { - opts := PruneOptions{RepackCachableOnly: true} + opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true} checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) @@ -1436,7 +1436,7 @@ func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) { rtest.OK(t, runCheck(checkOpts, env.gopts, nil)) } -var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5} +var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"} func 
listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet { r, err := OpenRepository(gopts) diff --git a/doc/060_forget.rst b/doc/060_forget.rst index 2df82af8b..08381f180 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -310,39 +310,54 @@ Customize pruning To understand the custom options, we first explain how the pruning process works: -- First all snapshots and directories within snapshots are scanned to determine - which data is still in use. -- Then for all pack files ``prune`` finds out if the file is fully used, partly - used or completely unused. -- Completely unused packs are marked for deletion. Fully used packs are kept. - A partially used pack is either kept or marked for repacking depending on user - options. - Note that for repacking, restic must download the file from the repository - storage and reupload the needed data in the repository. This can be very - time-consuming for remote repositories. -- After deciding what to do, ``prune`` will actually perform the repack, modify - the index according to the changes and delete the obsolete files. +1. All snapshots and directories within snapshots are scanned to determine + which data is still in use. +2. For all files in the repository, restic finds out if the file is fully + used, partly used or completely unused. +3. Completely unused files are marked for deletion. Fully used files are kept. + A partially used file is either kept or marked for repacking depending on user + options. + + Note that for repacking, restic must download the file from the repository + storage and re-upload the needed data in the repository. This can be very + time-consuming for remote repositories. +4. After deciding what to do, ``prune`` will actually perform the repack, modify + the index according to the changes and delete the obsolete files. The ``prune`` command accepts the following options: - ``--max-unused limit`` allow unused data up to the specified limit within the repository. 
- This allows restic to keep partly used packs instead of repacking them. - The limit can be specified as size, e.g. "200M" or in percentage with respect to the total - repository size, e.g. "0.5%". - ``prune`` tries to repack as little data as possible while still ensuring this + This allows restic to keep partly used files instead of repacking them. + + The limit can be specified in several ways: + + * As an absolute size (e.g. ``200M``). If you want to minimize the space + used by your repository, pass ``0`` to this option. + * As a size relative to the total repo size (e.g. ``10%``). This means that + after prune, at most ``10%`` of the total data stored in the repo may be + unused data. If the repo after prune has a size of 500MB, then at most + 50MB may be unused. + * If the string ``unlimited`` is passed, there is no limit for partly + unused files. This means that as long as some data is still used within + a file stored in the repo, restic will just leave it there. Use this if + you want to minimize the time and bandwidth used by the ``prune`` + operation. + + Restic tries to repack as little data as possible while still ensuring this limit for unused data. - If you want to minimize the space used by your repository, use a value of 0%. - If you want to minimize the time and bandwidth used by the ``prune`` command, use a - high value. A value of 100% will not require any pack file to be repacked. - The default value is 5%. -- ``--max-repack-size size`` if set limits the total size of packs to repack. - As ``prune`` first stores all repacked packs and deletes the obsolete packs at the end, - this option might be handy if you expect many packs to be repacked and fear to run low - on storage. -- ``--repack-cacheable-only`` if set to true only pack files which are cacheable are repacked. - Other pack files are not repacked, if this option is set. - This allows a very fast repacking using only cached data. 
It can, however, imply that the - unused data in your repository exceeds the value given by ``--max-unused-percent``. - The default value is false. + +- ``--max-repack-size size`` if set limits the total size of files to repack. + As ``prune`` first stores all repacked files and deletes the obsolete files at the end, + this option might be handy if you expect many files to be repacked and fear to run low + on storage. + +- ``--repack-cacheable-only`` if set to true only files which contain + metadata and would be stored in the cache are repacked. Other pack files are + not repacked if this option is set. This allows a very fast repacking + using only cached data. It can, however, imply that the unused data in + your repository exceeds the value given by ``--max-unused``. + The default value is false. + - ``--dry-run`` only show what ``prune`` would do. + - ``--verbose`` increased verbosity shows additional statistics for ``prune``.