From e7ddab714bb38b3704eea285e9f9fa112412a831 Mon Sep 17 00:00:00 2001 From: Dimitri Herzog Date: Sat, 18 Sep 2021 22:37:02 +0200 Subject: [PATCH] regex support for matching (#12) --- README.md | 1 + docs/config.yml | 2 + docs/configuration.md | 14 ++- docs/includes/abbreviations.md | 3 +- docs/index.md | 1 + lists/caches.go | 182 +++++++++++++++++++++++++++++++++ lists/caches_test.go | 79 ++++++++++++++ lists/list_cache.go | 67 ++---------- lists/list_cache_test.go | 18 +++- 9 files changed, 305 insertions(+), 62 deletions(-) create mode 100644 lists/caches.go create mode 100644 lists/caches_test.go diff --git a/README.md b/README.md index 0890b8e6..db3bdf62 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo * Definition of black and white lists per client group (Kids, Smart home devices, etc.) * Periodical reload of external black and white lists + * Regex support * Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists) - **Advanced DNS configuration** - not just an ad-blocker diff --git a/docs/config.yml b/docs/config.yml index a9136949..325607f6 100644 --- a/docs/config.yml +++ b/docs/config.yml @@ -53,6 +53,8 @@ blocking: # inline definition with YAML literal block scalar style # hosts format whitelistdomain.com + # this is a regex + /^banners?[_.-]/ # definition: which groups should be applied for which client clientGroupsBlock: # default will be used, if no special definition for a client name exists diff --git a/docs/configuration.md b/docs/configuration.md index 20414b94..9bb76862 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -176,7 +176,8 @@ contains a map of client name and multiple IP addresses. Blocky can download and use external lists with domains or IP addresses to block DNS query (e.g. advertisement, malware, trackers, adult sites). 
You can group several list sources together and define the blocking behavior per client. -External blacklists must be in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)). +External blacklists must be either in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)) or just +a plain domain list (one domain per line). Blocky also supports regex as a more powerful tool to define patterns to block. Blocky uses [DNS sinkhole](https://en.wikipedia.org/wiki/DNS_sinkhole) approach to block a DNS query. Domain name from the request, IP address from the response, and the CNAME record will be checked against configured blacklists. @@ -200,6 +201,8 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a # inline definition with YAML literal block scalar style someadsdomain.com anotheradsdomain.com + # this is a regex + /^banners?[_.-]/ special: - https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts whiteLists: @@ -218,6 +221,15 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a If a group has **only** whitelist entries -> this means only domains from this list are allowed, all other domains will be blocked +#### Regex support + +You can use regex to define patterns to block. A regex entry must start and end with the slash character (/). Some +examples: + +- `/baddomain/` will block `www.baddomain.com`, `baddomain.com`, but also `mybaddomain-sometext.com` +- `/^baddomain/` will block `baddomain.com`, but not `www.baddomain.com` +- `/^apple\.(de|com)$/` will only block `apple.de` and `apple.com` + ### Client groups In this configuration section, you can define, which blocking group(s) should be used for which client in your network. 
diff --git a/docs/includes/abbreviations.md b/docs/includes/abbreviations.md index a070267b..1dd4be24 100644 --- a/docs/includes/abbreviations.md +++ b/docs/includes/abbreviations.md @@ -22,4 +22,5 @@ *[CSV]: Comma-separated values *[SAMBA]: Server Message Block Protocol (Windows Network File System) *[DHCP]: Dynamic Host Configuration Protocol -*[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h". \ No newline at end of file +*[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h". +*[regex]: Regular expression \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index af9cb21c..4382ced9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo * Definition of black and white lists per client group (Kids, Smart home devices, etc.) * Periodical reload of external black and white lists + * Regex support * Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists) - **Advanced DNS configuration** - :nerd: not just an ad-blocker diff --git a/lists/caches.go b/lists/caches.go new file mode 100644 index 00000000..58138288 --- /dev/null +++ b/lists/caches.go @@ -0,0 +1,182 @@ +package lists + +import ( + "regexp" + "sort" + "strings" + + "github.com/0xERR0R/blocky/log" + + "github.com/0xERR0R/blocky/util" +) + +type cache interface { + elementCount() int + contains(searchString string) bool +} + +type cacheFactory interface { + addEntry(entry string) + create() cache +} + +type stringCache map[int]string + +func (cache stringCache) elementCount() int { + count := 0 + + for k, v := range cache { + count += len(v) / k + } + + return count +} + +func (cache stringCache) contains(searchString string) bool { + searchLen := len(searchString) + if searchLen == 0 { + return false + } + + searchBucketLen := 
len(cache[searchLen]) / searchLen + idx := sort.Search(searchBucketLen, func(i int) bool { + return cache[searchLen][i*searchLen:i*searchLen+searchLen] >= searchString + }) + + if idx < searchBucketLen { + return cache[searchLen][idx*searchLen:idx*searchLen+searchLen] == strings.ToLower(searchString) + } + + return false +} + +type stringCacheFactory struct { + cache stringCache + keys map[string]struct{} + tmp map[int]*strings.Builder +} + +func newStringCacheFactory() cacheFactory { + return &stringCacheFactory{ + cache: make(stringCache), + // temporary map to remove duplicates + keys: make(map[string]struct{}), + tmp: make(map[int]*strings.Builder), + } +} + +func (s *stringCacheFactory) addEntry(entry string) { + if _, value := s.keys[entry]; !value { + s.keys[entry] = struct{}{} + if s.tmp[len(entry)] == nil { + s.tmp[len(entry)] = &strings.Builder{} + } + + s.tmp[len(entry)].WriteString(entry) + } +} + +func (s *stringCacheFactory) create() cache { + for k, v := range s.tmp { + chunks := util.Chunks(v.String(), k) + sort.Strings(chunks) + + s.cache[k] = strings.Join(chunks, "") + + v.Reset() + } + + return s.cache +} + +type regexCache []*regexp.Regexp + +func (cache regexCache) elementCount() int { + return len(cache) +} + +func (cache regexCache) contains(searchString string) bool { + for _, regex := range cache { + if regex.MatchString(searchString) { + log.PrefixedLog("regexCache").Debugf("regex '%s' matched with '%s'", regex, searchString) + return true + } + } + + return false +} + +type regexCacheFactory struct { + cache regexCache +} + +func (r *regexCacheFactory) addEntry(entry string) { + compile, err := regexp.Compile(entry) + if err != nil { + log.Log().Warnf("invalid regex '%s'", entry) + } else { + r.cache = append(r.cache, compile) + } +} + +func (r *regexCacheFactory) create() cache { + return r.cache +} + +func newRegexCacheFactory() cacheFactory { + return ®exCacheFactory{ + cache: make(regexCache, 0), + } +} + +type chainedCache struct { + 
caches []cache +} + +func (cache chainedCache) elementCount() int { + sum := 0 + for _, c := range cache.caches { + sum += c.elementCount() + } + + return sum +} + +func (cache chainedCache) contains(searchString string) bool { + for _, c := range cache.caches { + if c.contains(searchString) { + return true + } + } + + return false +} + +type chainedCacheFactory struct { + stringCacheFactory cacheFactory + regexCacheFactory cacheFactory +} + +var regexPattern = regexp.MustCompile("^/.*/$") + +func (r *chainedCacheFactory) addEntry(entry string) { + if regexPattern.MatchString(entry) { + entry = strings.TrimSpace(strings.Trim(entry, "/")) + r.regexCacheFactory.addEntry(entry) + } else { + r.stringCacheFactory.addEntry(entry) + } +} + +func (r *chainedCacheFactory) create() cache { + return &chainedCache{ + caches: []cache{r.stringCacheFactory.create(), r.regexCacheFactory.create()}, + } +} + +func newChainedCacheFactory() cacheFactory { + return &chainedCacheFactory{ + stringCacheFactory: newStringCacheFactory(), + regexCacheFactory: newRegexCacheFactory(), + } +} diff --git a/lists/caches_test.go b/lists/caches_test.go new file mode 100644 index 00000000..9aa22d46 --- /dev/null +++ b/lists/caches_test.go @@ -0,0 +1,79 @@ +package lists + +import ( + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Caches", func() { + Describe("String cache", func() { + When("string cache was created", func() { + factory := newStringCacheFactory() + factory.addEntry("google.com") + factory.addEntry("apple.com") + cache := factory.create() + It("should match if cache contains string", func() { + Expect(cache.contains("apple.com")).Should(BeTrue()) + Expect(cache.contains("google.com")).Should(BeTrue()) + Expect(cache.contains("www.google.com")).Should(BeFalse()) + }) + It("should return correct element count", func() { + Expect(cache.elementCount()).Should(Equal(2)) + }) + }) + }) + + Describe("Regex cache", func() { + When("regex cache was created", func() { + factory := newRegexCacheFactory() + factory.addEntry(".*google.com") + factory.addEntry("^apple\\.(de|com)$") + factory.addEntry("amazon") + // this is not a regex, will be ignored + factory.addEntry("(wrongRegex") + cache := factory.create() + It("should match if one regex in cache matches string", func() { + Expect(cache.contains("google.com")).Should(BeTrue()) + Expect(cache.contains("google.coma")).Should(BeTrue()) + Expect(cache.contains("agoogle.com")).Should(BeTrue()) + Expect(cache.contains("www.google.com")).Should(BeTrue()) + Expect(cache.contains("apple.com")).Should(BeTrue()) + Expect(cache.contains("apple.de")).Should(BeTrue()) + Expect(cache.contains("apple.it")).Should(BeFalse()) + Expect(cache.contains("www.apple.com")).Should(BeFalse()) + Expect(cache.contains("applecom")).Should(BeFalse()) + Expect(cache.contains("www.amazon.com")).Should(BeTrue()) + Expect(cache.contains("amazon.com")).Should(BeTrue()) + Expect(cache.contains("myamazon.com")).Should(BeTrue()) + }) + It("should return correct element count", func() { + Expect(cache.elementCount()).Should(Equal(3)) + }) + }) + }) + + Describe("Chained cache", func() { + When("chained cache was created", func() { + factory := newChainedCacheFactory() + factory.addEntry("/.*google.com/") + 
factory.addEntry("/^apple\\.(de|com)$/") + factory.addEntry("amazon.com") + cache := factory.create() + It("should match if one regex in cache matches string", func() { + Expect(cache.contains("google.com")).Should(BeTrue()) + Expect(cache.contains("google.coma")).Should(BeTrue()) + Expect(cache.contains("agoogle.com")).Should(BeTrue()) + Expect(cache.contains("www.google.com")).Should(BeTrue()) + Expect(cache.contains("apple.com")).Should(BeTrue()) + Expect(cache.contains("amazon.com")).Should(BeTrue()) + Expect(cache.contains("apple.de")).Should(BeTrue()) + Expect(cache.contains("www.apple.com")).Should(BeFalse()) + Expect(cache.contains("applecom")).Should(BeFalse()) + }) + It("should return correct element count", func() { + Expect(cache.elementCount()).Should(Equal(3)) + }) + }) + }) + +}) diff --git a/lists/list_cache.go b/lists/list_cache.go index 78d667f1..bb6e78f5 100644 --- a/lists/list_cache.go +++ b/lists/list_cache.go @@ -9,15 +9,12 @@ import ( "net" "net/http" "os" - "sort" "strings" "sync" "time" "github.com/0xERR0R/blocky/evt" "github.com/0xERR0R/blocky/log" - "github.com/0xERR0R/blocky/util" - "github.com/sirupsen/logrus" ) @@ -32,36 +29,6 @@ const ( // ) type ListCacheType int -type stringCache map[int]string - -func (cache stringCache) elementCount() int { - count := 0 - - for k, v := range cache { - count += len(v) / k - } - - return count -} - -func (cache stringCache) contains(searchString string) bool { - searchLen := len(searchString) - if searchLen == 0 { - return false - } - - searchBucketLen := len(cache[searchLen]) / searchLen - idx := sort.Search(searchBucketLen, func(i int) bool { - return cache[searchLen][i*searchLen:i*searchLen+searchLen] >= searchString - }) - - if idx < searchBucketLen { - return cache[searchLen][idx*searchLen:idx*searchLen+searchLen] == strings.ToLower(searchString) - } - - return false -} - // Matcher checks if a domain is in a list type Matcher interface { // Match matches passed domain name against cached list 
entries @@ -73,7 +40,7 @@ type Matcher interface { // ListCache generic cache of strings divided in groups type ListCache struct { - groupCaches map[string]stringCache + groupCaches map[string]cache lock sync.RWMutex groupToLinks map[string][]string @@ -115,7 +82,7 @@ func (b *ListCache) Configuration() (result []string) { // NewListCache creates new list instance func NewListCache(t ListCacheType, groupToLinks map[string][]string, refreshPeriod time.Duration, downloadTimeout time.Duration) *ListCache { - groupCaches := make(map[string]stringCache) + groupCaches := make(map[string]cache) timeout := downloadTimeout if downloadTimeout == 0 { @@ -159,15 +126,12 @@ func logger() *logrus.Entry { } // downloads and reads files with domain names and creates cache for them -func (b *ListCache) createCacheForGroup(links []string) stringCache { - cache := make(stringCache) - - keys := make(map[string]struct{}) - +func (b *ListCache) createCacheForGroup(links []string) cache { var wg sync.WaitGroup c := make(chan []string, len(links)) + // loop over links (http/local) or inline definitions for _, link := range links { wg.Add(1) @@ -176,7 +140,7 @@ func (b *ListCache) createCacheForGroup(links []string) stringCache { wg.Wait() - tmp := make(map[int]*strings.Builder) + factory := newChainedCacheFactory() Loop: for { @@ -186,13 +150,7 @@ Loop: return nil } for _, entry := range res { - if _, value := keys[entry]; !value { - keys[entry] = struct{}{} - if tmp[len(entry)] == nil { - tmp[len(entry)] = &strings.Builder{} - } - tmp[len(entry)].WriteString(entry) - } + factory.addEntry(entry) } default: close(c) @@ -200,16 +158,7 @@ Loop: } } - for k, v := range tmp { - chunks := util.Chunks(v.String(), k) - sort.Strings(chunks) - - cache[k] = strings.Join(chunks, "") - - v.Reset() - } - - return cache + return factory.create() } // Match matches passed domain name against cached list entries @@ -218,7 +167,7 @@ func (b *ListCache) Match(domain string, groupsToCheck []string) (found 
bool, gr defer b.lock.RUnlock() for _, g := range groupsToCheck { - if b.groupCaches[g].contains(domain) { + if c, ok := b.groupCaches[g]; ok && c.contains(domain) { return true, g } } diff --git a/lists/list_cache_test.go b/lists/list_cache_test.go index c1b24f21..1f4bbb41 100644 --- a/lists/list_cache_test.go +++ b/lists/list_cache_test.go @@ -19,7 +19,6 @@ var _ = Describe("ListCache", func() { emptyFile, file1, file2, file3 *os.File server1, server2, server3 *httptest.Server ) - BeforeEach(func() { emptyFile = TempFile("#empty file\n\n") server1 = TestServer("blocked1.com\nblocked1a.com\n192.168.178.55") @@ -259,6 +258,23 @@ var _ = Describe("ListCache", func() { Expect(group).Should(Equal("gr1")) }) }) + When("inline regex content is defined", func() { + It("should match", func() { + lists := map[string][]string{ + "gr1": {"/^apple\\.(de|com)$/\n"}, + } + + sut := NewListCache(ListCacheTypeBlacklist, lists, 0, 0) + + found, group := sut.Match("apple.com", []string{"gr1"}) + Expect(found).Should(BeTrue()) + Expect(group).Should(Equal("gr1")) + + found, group = sut.Match("apple.de", []string{"gr1"}) + Expect(found).Should(BeTrue()) + Expect(group).Should(Equal("gr1")) + }) + }) }) Describe("Configuration", func() { When("refresh is enabled", func() {