2020-01-12 18:23:35 +01:00
|
|
|
package lists
|
|
|
|
|
2022-08-03 22:10:07 +02:00
|
|
|
//go:generate go run github.com/abice/go-enum -f=$GOFILE --marshal --names
|
2020-01-12 18:23:35 +01:00
|
|
|
import (
|
2023-03-07 01:32:41 +01:00
|
|
|
"context"
|
2021-03-05 22:52:22 +01:00
|
|
|
"errors"
|
2020-01-12 18:23:35 +01:00
|
|
|
"fmt"
|
2020-02-13 18:12:59 +01:00
|
|
|
"net"
|
2020-01-12 18:23:35 +01:00
|
|
|
|
2023-03-07 01:32:41 +01:00
|
|
|
"github.com/sirupsen/logrus"
|
2021-10-13 22:45:32 +02:00
|
|
|
|
2023-03-07 01:32:41 +01:00
|
|
|
"github.com/0xERR0R/blocky/cache/stringcache"
|
2023-04-17 18:21:56 +02:00
|
|
|
"github.com/0xERR0R/blocky/config"
|
2021-08-25 22:06:34 +02:00
|
|
|
"github.com/0xERR0R/blocky/evt"
|
2023-03-07 01:32:41 +01:00
|
|
|
"github.com/0xERR0R/blocky/lists/parsers"
|
2021-08-25 22:06:34 +02:00
|
|
|
"github.com/0xERR0R/blocky/log"
|
2023-04-17 18:21:56 +02:00
|
|
|
"github.com/ThinkChaos/parcour"
|
|
|
|
"github.com/ThinkChaos/parcour/jobgroup"
|
2020-01-12 18:23:35 +01:00
|
|
|
)
|
|
|
|
|
2023-11-17 15:58:35 +01:00
|
|
|
const (
	// groupProducersBufferCap is the capacity of the channel that buffers
	// parsed hosts between the per-source producers and the single cache
	// consumer of a group.
	groupProducersBufferCap = 1000

	// regexWarningThreshold is the number of regex entries above which
	// LogConfig emits a warning about their memory and lookup cost.
	regexWarningThreshold = 500
)
|
2022-05-16 21:32:16 +02:00
|
|
|
|
2021-09-09 22:57:05 +02:00
|
|
|
// NOTE: the ENUM(...) comment below is parsed by go-enum (see the
// //go:generate directive at the top of this file); do not reword it.

// ListCacheType represents the type of cached list ENUM(
// blacklist // is a list with blocked domains
// whitelist // is a list with whitelisted domains / IPs
// )
type ListCacheType int
|
|
|
|
|
2021-02-26 13:45:57 +01:00
|
|
|
// Matcher checks if a domain is in a list
type Matcher interface {
	// Match matches passed domain name against cached list entries.
	// It returns the subset of groupsToCheck whose cache contains the domain.
	Match(domain string, groupsToCheck []string) (groups []string)
}
|
|
|
|
|
2021-02-26 13:45:57 +01:00
|
|
|
// ListCache generic cache of strings divided in groups
type ListCache struct {
	// groupedCache is the chained cache (regex, wildcard, plain string)
	// actually queried by Match.
	groupedCache stringcache.GroupedStringCache
	// regexCache is the regex element of groupedCache, kept separately so
	// LogConfig can report regex entry counts per group.
	regexCache stringcache.GroupedStringCache

	cfg      config.SourceLoading // refresh/concurrency/error-tolerance settings
	listType ListCacheType        // blacklist or whitelist (see enum above)
	// groupSources maps a group name to the list of sources it is built from.
	groupSources map[string][]config.BytesSource
	// downloader fetches remote sources.
	downloader FileDownloader
}
|
|
|
|
|
2023-03-12 22:14:10 +01:00
|
|
|
// LogConfig implements `config.Configurable`.
|
|
|
|
func (b *ListCache) LogConfig(logger *logrus.Entry) {
|
2023-11-17 15:58:35 +01:00
|
|
|
total := 0
|
|
|
|
regexes := 0
|
2020-01-12 18:23:35 +01:00
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
for group := range b.groupSources {
|
2023-03-27 13:23:01 +02:00
|
|
|
count := b.groupedCache.ElementCount(group)
|
|
|
|
logger.Infof("%s: %d entries", group, count)
|
|
|
|
total += count
|
2023-11-17 15:58:35 +01:00
|
|
|
regexes += b.regexCache.ElementCount(group)
|
|
|
|
}
|
|
|
|
|
|
|
|
if regexes > regexWarningThreshold {
|
|
|
|
logger.Warnf(
|
|
|
|
"REGEXES: %d !! High use of regexes is not recommended: they use a lot of memory and are very slow to search",
|
|
|
|
regexes,
|
|
|
|
)
|
2020-01-12 18:23:35 +01:00
|
|
|
}
|
|
|
|
|
2023-03-12 22:14:10 +01:00
|
|
|
logger.Infof("TOTAL: %d entries", total)
|
2020-01-12 18:23:35 +01:00
|
|
|
}
|
|
|
|
|
2021-02-26 13:45:57 +01:00
|
|
|
// NewListCache creates new list instance
|
2023-10-07 22:21:40 +02:00
|
|
|
func NewListCache(ctx context.Context,
|
2023-12-20 21:38:33 +01:00
|
|
|
t ListCacheType, cfg config.SourceLoading,
|
2023-04-17 18:21:56 +02:00
|
|
|
groupSources map[string][]config.BytesSource, downloader FileDownloader,
|
2022-12-26 22:11:45 +01:00
|
|
|
) (*ListCache, error) {
|
2023-11-17 15:58:35 +01:00
|
|
|
regexCache := stringcache.NewInMemoryGroupedRegexCache()
|
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
c := &ListCache{
|
2023-03-27 13:23:01 +02:00
|
|
|
groupedCache: stringcache.NewChainedGroupedCache(
|
2023-11-17 15:58:35 +01:00
|
|
|
regexCache,
|
|
|
|
stringcache.NewInMemoryGroupedWildcardCache(), // must be after regex which can contain '*'
|
|
|
|
stringcache.NewInMemoryGroupedStringCache(), // accepts all values, must be last
|
2023-03-27 13:23:01 +02:00
|
|
|
),
|
2023-11-17 15:58:35 +01:00
|
|
|
regexCache: regexCache,
|
2022-12-17 23:06:58 +01:00
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
cfg: cfg,
|
|
|
|
listType: t,
|
|
|
|
groupSources: groupSources,
|
|
|
|
downloader: downloader,
|
2022-09-03 22:12:07 +02:00
|
|
|
}
|
2021-10-13 21:40:18 +02:00
|
|
|
|
2023-10-07 22:21:40 +02:00
|
|
|
err := cfg.StartPeriodicRefresh(ctx, c.refresh, func(err error) {
|
2023-04-17 18:21:56 +02:00
|
|
|
logger().WithError(err).Errorf("could not init %s", t)
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2021-10-13 21:30:14 +02:00
|
|
|
}
|
2021-10-13 21:40:18 +02:00
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
return c, nil
|
2020-01-12 18:23:35 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func logger() *logrus.Entry {
|
2021-02-25 23:36:39 +01:00
|
|
|
return log.PrefixedLog("list_cache")
|
2020-01-12 18:23:35 +01:00
|
|
|
}
|
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
// Match matches passed domain name against cached list entries
|
|
|
|
func (b *ListCache) Match(domain string, groupsToCheck []string) (groups []string) {
|
|
|
|
return b.groupedCache.Contains(domain, groupsToCheck)
|
|
|
|
}
|
2020-01-12 18:23:35 +01:00
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
// Refresh triggers the refresh of a list
|
2023-09-09 19:30:55 +02:00
|
|
|
func (b *ListCache) Refresh() error {
|
|
|
|
return b.refresh(context.Background())
|
2023-04-17 18:21:56 +02:00
|
|
|
}
|
2020-01-12 18:23:35 +01:00
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
// refresh rebuilds the caches of all groups. Each group is processed in its
// own goroutine; source downloads/parses across all groups share a bounded
// concurrency budget (b.cfg.Concurrency). Returns the first group error, if
// any, after all groups have finished.
func (b *ListCache) refresh(ctx context.Context) error {
	// unlimitedGrp holds one goroutine per group (and the consumers spawned
	// by createCacheForGroup).
	unlimitedGrp, _ := jobgroup.WithContext(ctx)
	defer unlimitedGrp.Close()

	// producersGrp bounds how many sources are fetched/parsed concurrently.
	producersGrp := jobgroup.WithMaxConcurrency(unlimitedGrp, b.cfg.Concurrency)
	defer producersGrp.Close()

	for group, sources := range b.groupSources {
		group, sources := group, sources // capture per-iteration values for the closure

		unlimitedGrp.Go(func(ctx context.Context) error {
			err := b.createCacheForGroup(producersGrp, unlimitedGrp, group, sources)
			if err != nil {
				// On failure the previous cache (if any) stays in place;
				// report how many entries the group still has.
				count := b.groupedCache.ElementCount(group)

				logger := logger().WithFields(logrus.Fields{
					"group":       group,
					"total_count": count,
				})

				if count == 0 {
					logger.Warn("Populating of group cache failed, cache will be empty until refresh succeeds")
				} else {
					logger.Warn("Populating of group cache failed, using existing cache, if any")
				}

				return err
			}

			count := b.groupedCache.ElementCount(group)

			// Notify subscribers (e.g. metrics) that this group's cache changed.
			evt.Bus().Publish(evt.BlockingCacheGroupChanged, b.listType, group, count)

			logger().WithFields(logrus.Fields{
				"group":       group,
				"total_count": count,
			}).Info("group import finished")

			return nil
		})
	}

	return unlimitedGrp.Wait()
}
|
|
|
|
|
2023-04-17 18:21:56 +02:00
|
|
|
// createCacheForGroup rebuilds the cache of a single group: one producer per
// source parses hosts into a buffered channel, and a single consumer feeds
// them into a fresh cache built via groupFactory.
//
// On error: if nothing was parsed, or the error is transient (worth retrying),
// the new cache is discarded and the error is returned. Otherwise the
// partially-populated cache is committed (Finish) and nil is returned.
func (b *ListCache) createCacheForGroup(
	producersGrp, consumersGrp jobgroup.JobGroup, group string, sources []config.BytesSource,
) error {
	groupFactory := b.groupedCache.Refresh(group)

	producers := parcour.NewProducersWithBuffer[string](producersGrp, consumersGrp, groupProducersBufferCap)
	defer producers.Close()

	for i, source := range sources {
		i, source := i, source // capture per-iteration values for the closure

		producers.GoProduce(func(ctx context.Context, hostsChan chan<- string) error {
			locInfo := fmt.Sprintf("item #%d of group %s", i, group)

			opener, err := NewSourceOpener(locInfo, source, b.downloader)
			if err != nil {
				return err
			}

			return b.parseFile(ctx, opener, hostsChan)
		})
	}

	// Written only by the single consumer goroutine; read after Wait.
	hasEntries := false

	producers.GoConsume(func(ctx context.Context, ch <-chan string) error {
		for host := range ch {
			// AddEntry reports whether any cache in the chain accepted the value.
			if groupFactory.AddEntry(host) {
				hasEntries = true
			} else {
				logger().WithField("host", host).Warn("no list cache was able to use host")
			}
		}

		return nil
	})

	err := producers.Wait()
	if err != nil {
		if !hasEntries {
			// Always fail the group if no entries were parsed
			return err
		}

		var transientErr *TransientError

		if errors.As(err, &transientErr) {
			// Temporary error: fail the whole group to retry later
			return err
		}
	}

	// Commit the new cache, replacing the group's previous one.
	groupFactory.Finish()

	return nil
}
|
|
|
|
|
2022-05-16 21:32:16 +02:00
|
|
|
// downloads file (or reads local file) and writes each line in the file to the result channel
//
// Parse errors on individual lines are tolerated up to
// b.cfg.MaxErrorsPerSource; the whole-source error is only propagated when
// nothing could be parsed at all.
func (b *ListCache) parseFile(ctx context.Context, opener SourceOpener, resultCh chan<- string) error {
	count := 0

	// logger is a function so the "count" field always reflects the number
	// of entries parsed so far at the time of logging.
	logger := func() *logrus.Entry {
		return logger().WithFields(logrus.Fields{
			"source": opener.String(),
			"count":  count,
		})
	}

	logger().Debug("starting processing of source")

	r, err := opener.Open(ctx)
	if err != nil {
		logger().Error("cannot open source: ", err)

		return err
	}
	defer r.Close()

	// Allow up to MaxErrorsPerSource malformed lines before failing the source.
	p := parsers.AllowErrors(parsers.Hosts(r), b.cfg.MaxErrorsPerSource)
	p.OnErr(func(err error) {
		logger().Warnf("parse error: %s, trying to continue", err)
	})

	err = parsers.ForEach[*parsers.HostsIterator](ctx, p, func(hosts *parsers.HostsIterator) error {
		return hosts.ForEach(func(host string) error {
			count++

			// For IPs, we want to ensure the string is the Go representation so that when
			// we compare responses, a same IP matches, even if it was written differently
			// in the list.
			if ip := net.ParseIP(host); ip != nil {
				host = ip.String()
			}

			resultCh <- host

			return nil
		})
	})
	if err != nil {
		// Don't log cancelation: it was caused by another goroutine failing
		if !errors.Is(err, context.Canceled) {
			logger().Error("parse error: ", err)
		}

		// Only propagate the error if no entries were parsed
		// If the file was partially parsed, we'll settle for that
		if count == 0 {
			return err
		}

		return nil
	}

	logger().Info("import succeeded")

	return nil
}
|