regex support for matching (#12)

This commit is contained in:
Dimitri Herzog 2021-09-18 22:37:02 +02:00
parent 925c6f97eb
commit e7ddab714b
9 changed files with 305 additions and 62 deletions

View File

@ -25,6 +25,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo
* Definition of black and white lists per client group (Kids, Smart home devices, etc.)
* Periodical reload of external black and white lists
* Regex support
* Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists)
- **Advanced DNS configuration** - not just an ad-blocker

View File

@ -53,6 +53,8 @@ blocking:
# inline definition with YAML literal block scalar style
# hosts format
whitelistdomain.com
# this is a regex
/^banners?[_.-]/
# definition: which groups should be applied for which client
clientGroupsBlock:
# default will be used, if no special definition for a client name exists

View File

@ -176,7 +176,8 @@ contains a map of client name and multiple IP addresses.
Blocky can download and use external lists with domains or IP addresses to block DNS queries (e.g. advertisement, malware,
trackers, adult sites). You can group several list sources together and define the blocking behavior per client.
External blacklists must be in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)).
External blacklists must be either in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)) or just
a plain domain list (one domain per line). Blocky also supports regex as a more powerful tool to define patterns to block.
Blocky uses the [DNS sinkhole](https://en.wikipedia.org/wiki/DNS_sinkhole) approach to block a DNS query. The domain name from
the request, the IP address from the response, and the CNAME record will be checked against configured blacklists.
@ -200,6 +201,8 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a
# inline definition with YAML literal block scalar style
someadsdomain.com
anotheradsdomain.com
# this is a regex
/^banners?[_.-]/
special:
- https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts
whiteLists:
@ -218,6 +221,15 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a
If a group has **only** whitelist entries -> this means only domains from this list are allowed, all other domains will
be blocked
#### Regex support
You can use regex to define patterns to block. A regex entry must start and end with the slash character (/). Some
examples:
- `/baddomain/` will block `www.baddomain.com`, `baddomain.com`, but also `mybaddomain-sometext.com`
- `/^baddomain/` will block `baddomain.com`, but not `www.baddomain.com`
- `/^apple\.(de|com)$/` will only block `apple.de` and `apple.com`
### Client groups
In this configuration section, you can define which blocking group(s) should be used for which client in your network.

View File

@ -22,4 +22,5 @@
*[CSV]: Comma-separated values
*[SAMBA]: Server Message Block Protocol (Windows Network File System)
*[DHCP]: Dynamic Host Configuration Protocol
*[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h".
*[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h".
*[regex]: Regular expression

View File

@ -12,6 +12,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo
* Definition of black and white lists per client group (Kids, Smart home devices, etc.)
* Periodical reload of external black and white lists
* Regex support
* Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists)
- **Advanced DNS configuration** - :nerd: not just an ad-blocker

182
lists/caches.go Normal file
View File

@ -0,0 +1,182 @@
package lists
import (
"regexp"
"sort"
"strings"
"github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/util"
)
// cache provides read-only access to a finished set of blocked entries.
type cache interface {
	// elementCount returns the total number of entries held by the cache.
	elementCount() int
	// contains reports whether searchString matches an entry of the cache.
	contains(searchString string) bool
}
// cacheFactory collects entries one by one and then builds an immutable cache.
type cacheFactory interface {
	// addEntry adds a single list entry to the factory.
	addEntry(entry string)
	// create finalizes the collected entries into a cache.
	create() cache
}
// stringCache stores all entries of the same length concatenated into one
// sorted string, keyed by entry length. Lookups are a binary search over the
// fixed-size slices of that string, which keeps per-entry memory overhead
// minimal compared to a map of strings.
type stringCache map[int]string

// elementCount returns the total number of entries across all length buckets.
func (cache stringCache) elementCount() int {
	count := 0

	// each bucket of key k holds len(v)/k entries of length k
	for k, v := range cache {
		count += len(v) / k
	}

	return count
}

// contains reports whether searchString is an entry of the cache.
// Matching is case-insensitive: the search term is lowercased once up front so
// that the binary-search probes and the final equality check agree. Previously
// only the equality check lowercased, so a mixed-case input could probe the
// wrong position in the lowercase-sorted bucket and miss an existing entry.
func (cache stringCache) contains(searchString string) bool {
	searchString = strings.ToLower(searchString)
	searchLen := len(searchString)

	if searchLen == 0 {
		return false
	}

	searchBucketLen := len(cache[searchLen]) / searchLen
	idx := sort.Search(searchBucketLen, func(i int) bool {
		return cache[searchLen][i*searchLen:i*searchLen+searchLen] >= searchString
	})

	if idx < searchBucketLen {
		return cache[searchLen][idx*searchLen:idx*searchLen+searchLen] == searchString
	}

	return false
}
// stringCacheFactory builds a stringCache. Duplicate entries are dropped on
// the way in, and entries are grouped by length into temporary builders until
// create() sorts and concatenates each bucket.
type stringCacheFactory struct {
	cache stringCache
	keys  map[string]struct{}
	tmp   map[int]*strings.Builder
}

func newStringCacheFactory() cacheFactory {
	return &stringCacheFactory{
		cache: make(stringCache),
		// keys tracks already-seen entries so duplicates are ignored
		keys: make(map[string]struct{}),
		tmp:  make(map[int]*strings.Builder),
	}
}

// addEntry records one entry; a duplicate of a previous entry is a no-op.
func (s *stringCacheFactory) addEntry(entry string) {
	if _, seen := s.keys[entry]; seen {
		return
	}

	s.keys[entry] = struct{}{}

	builder, ok := s.tmp[len(entry)]
	if !ok {
		builder = &strings.Builder{}
		s.tmp[len(entry)] = builder
	}

	builder.WriteString(entry)
}

// create sorts every length bucket and joins it into the final cache string.
func (s *stringCacheFactory) create() cache {
	for length, builder := range s.tmp {
		chunks := util.Chunks(builder.String(), length)
		sort.Strings(chunks)
		s.cache[length] = strings.Join(chunks, "")
		builder.Reset()
	}

	return s.cache
}
// regexCache is a list of compiled regular expressions; a string is contained
// if any one of the expressions matches it.
type regexCache []*regexp.Regexp

// elementCount returns the number of compiled expressions.
func (cache regexCache) elementCount() int {
	return len(cache)
}

// contains reports whether at least one cached expression matches
// searchString. Matching expressions are logged at debug level.
func (cache regexCache) contains(searchString string) bool {
	for _, re := range cache {
		if !re.MatchString(searchString) {
			continue
		}

		log.PrefixedLog("regexCache").Debugf("regex '%s' matched with '%s'", re, searchString)

		return true
	}

	return false
}
// regexCacheFactory collects regex entries, compiling each one eagerly.
// Entries that fail to compile are logged and dropped.
type regexCacheFactory struct {
	cache regexCache
}

// addEntry compiles entry and appends it to the cache; an invalid expression
// only produces a warning.
func (r *regexCacheFactory) addEntry(entry string) {
	compiled, err := regexp.Compile(entry)
	if err != nil {
		log.Log().Warnf("invalid regex '%s'", entry)

		return
	}

	r.cache = append(r.cache, compiled)
}

// create returns the collected expressions as a cache.
func (r *regexCacheFactory) create() cache {
	return r.cache
}

func newRegexCacheFactory() cacheFactory {
	return &regexCacheFactory{
		cache: make(regexCache, 0),
	}
}
// chainedCache combines several caches into one; a string is contained if any
// of the sub-caches contains it.
type chainedCache struct {
	caches []cache
}

// elementCount sums the counts of all sub-caches.
func (cache chainedCache) elementCount() int {
	total := 0
	for _, sub := range cache.caches {
		total += sub.elementCount()
	}

	return total
}

// contains asks each sub-cache in order and stops at the first hit.
func (cache chainedCache) contains(searchString string) bool {
	for _, sub := range cache.caches {
		if sub.contains(searchString) {
			return true
		}
	}

	return false
}
// chainedCacheFactory dispatches each entry to either a plain-string factory
// or a regex factory, depending on whether the entry is wrapped in slashes.
type chainedCacheFactory struct {
	stringCacheFactory cacheFactory
	regexCacheFactory  cacheFactory
}

// regexPattern detects regex entries: they start and end with '/'.
var regexPattern = regexp.MustCompile("^/.*/$")

// addEntry routes entry to the matching sub-factory. Regex entries are
// stripped of their surrounding slashes and whitespace before compilation.
func (r *chainedCacheFactory) addEntry(entry string) {
	if !regexPattern.MatchString(entry) {
		r.stringCacheFactory.addEntry(entry)

		return
	}

	stripped := strings.TrimSpace(strings.Trim(entry, "/"))
	r.regexCacheFactory.addEntry(stripped)
}

// create builds one chained cache containing both sub-caches.
func (r *chainedCacheFactory) create() cache {
	return &chainedCache{
		caches: []cache{r.stringCacheFactory.create(), r.regexCacheFactory.create()},
	}
}

func newChainedCacheFactory() cacheFactory {
	return &chainedCacheFactory{
		stringCacheFactory: newStringCacheFactory(),
		regexCacheFactory:  newRegexCacheFactory(),
	}
}

79
lists/caches_test.go Normal file
View File

@ -0,0 +1,79 @@
package lists
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Ginkgo suite for the cache implementations in caches.go. Note that the
// factories are populated at spec-tree construction time (inside the When
// closures) and the assertions run later inside the It blocks.
var _ = Describe("Caches", func() {
	Describe("String cache", func() {
		When("string cache was created", func() {
			factory := newStringCacheFactory()
			factory.addEntry("google.com")
			factory.addEntry("apple.com")
			cache := factory.create()
			It("should match if cache contains string", func() {
				Expect(cache.contains("apple.com")).Should(BeTrue())
				Expect(cache.contains("google.com")).Should(BeTrue())
				// string cache matches exact entries only, no subdomains
				Expect(cache.contains("www.google.com")).Should(BeFalse())
			})
			It("should return correct element count", func() {
				Expect(cache.elementCount()).Should(Equal(2))
			})
		})
	})
	Describe("Regex cache", func() {
		When("regex cache was created", func() {
			factory := newRegexCacheFactory()
			factory.addEntry(".*google.com")
			factory.addEntry("^apple\\.(de|com)$")
			// unanchored pattern: matches the substring anywhere
			factory.addEntry("amazon")
			// this is not a regex, will be ignored
			factory.addEntry("(wrongRegex")
			cache := factory.create()
			It("should match if one regex in cache matches string", func() {
				Expect(cache.contains("google.com")).Should(BeTrue())
				Expect(cache.contains("google.coma")).Should(BeTrue())
				Expect(cache.contains("agoogle.com")).Should(BeTrue())
				Expect(cache.contains("www.google.com")).Should(BeTrue())
				Expect(cache.contains("apple.com")).Should(BeTrue())
				Expect(cache.contains("apple.de")).Should(BeTrue())
				// anchored apple pattern rejects other TLDs and subdomains
				Expect(cache.contains("apple.it")).Should(BeFalse())
				Expect(cache.contains("www.apple.com")).Should(BeFalse())
				Expect(cache.contains("applecom")).Should(BeFalse())
				Expect(cache.contains("www.amazon.com")).Should(BeTrue())
				Expect(cache.contains("amazon.com")).Should(BeTrue())
				Expect(cache.contains("myamazon.com")).Should(BeTrue())
			})
			It("should return correct element count", func() {
				// the invalid "(wrongRegex" entry was dropped, so 3 remain
				Expect(cache.elementCount()).Should(Equal(3))
			})
		})
	})
	Describe("Chained cache", func() {
		When("chained cache was created", func() {
			factory := newChainedCacheFactory()
			// slash-wrapped entries go to the regex cache,
			// plain entries to the string cache
			factory.addEntry("/.*google.com/")
			factory.addEntry("/^apple\\.(de|com)$/")
			factory.addEntry("amazon.com")
			cache := factory.create()
			It("should match if one regex in cache matches string", func() {
				Expect(cache.contains("google.com")).Should(BeTrue())
				Expect(cache.contains("google.coma")).Should(BeTrue())
				Expect(cache.contains("agoogle.com")).Should(BeTrue())
				Expect(cache.contains("www.google.com")).Should(BeTrue())
				Expect(cache.contains("apple.com")).Should(BeTrue())
				Expect(cache.contains("amazon.com")).Should(BeTrue())
				Expect(cache.contains("apple.de")).Should(BeTrue())
				Expect(cache.contains("www.apple.com")).Should(BeFalse())
				Expect(cache.contains("applecom")).Should(BeFalse())
			})
			It("should return correct element count", func() {
				// counts both sub-caches: 2 regexes + 1 string entry
				Expect(cache.elementCount()).Should(Equal(3))
			})
		})
	})
})

View File

@ -9,15 +9,12 @@ import (
"net"
"net/http"
"os"
"sort"
"strings"
"sync"
"time"
"github.com/0xERR0R/blocky/evt"
"github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/util"
"github.com/sirupsen/logrus"
)
@ -32,36 +29,6 @@ const (
// )
type ListCacheType int
type stringCache map[int]string
func (cache stringCache) elementCount() int {
count := 0
for k, v := range cache {
count += len(v) / k
}
return count
}
func (cache stringCache) contains(searchString string) bool {
searchLen := len(searchString)
if searchLen == 0 {
return false
}
searchBucketLen := len(cache[searchLen]) / searchLen
idx := sort.Search(searchBucketLen, func(i int) bool {
return cache[searchLen][i*searchLen:i*searchLen+searchLen] >= searchString
})
if idx < searchBucketLen {
return cache[searchLen][idx*searchLen:idx*searchLen+searchLen] == strings.ToLower(searchString)
}
return false
}
// Matcher checks if a domain is in a list
type Matcher interface {
// Match matches passed domain name against cached list entries
@ -73,7 +40,7 @@ type Matcher interface {
// ListCache generic cache of strings divided in groups
type ListCache struct {
groupCaches map[string]stringCache
groupCaches map[string]cache
lock sync.RWMutex
groupToLinks map[string][]string
@ -115,7 +82,7 @@ func (b *ListCache) Configuration() (result []string) {
// NewListCache creates new list instance
func NewListCache(t ListCacheType, groupToLinks map[string][]string, refreshPeriod time.Duration,
downloadTimeout time.Duration) *ListCache {
groupCaches := make(map[string]stringCache)
groupCaches := make(map[string]cache)
timeout := downloadTimeout
if downloadTimeout == 0 {
@ -159,15 +126,12 @@ func logger() *logrus.Entry {
}
// downloads and reads files with domain names and creates cache for them
func (b *ListCache) createCacheForGroup(links []string) stringCache {
cache := make(stringCache)
keys := make(map[string]struct{})
func (b *ListCache) createCacheForGroup(links []string) cache {
var wg sync.WaitGroup
c := make(chan []string, len(links))
// loop over links (http/local) or inline definitions
for _, link := range links {
wg.Add(1)
@ -176,7 +140,7 @@ func (b *ListCache) createCacheForGroup(links []string) stringCache {
wg.Wait()
tmp := make(map[int]*strings.Builder)
factory := newChainedCacheFactory()
Loop:
for {
@ -186,13 +150,7 @@ Loop:
return nil
}
for _, entry := range res {
if _, value := keys[entry]; !value {
keys[entry] = struct{}{}
if tmp[len(entry)] == nil {
tmp[len(entry)] = &strings.Builder{}
}
tmp[len(entry)].WriteString(entry)
}
factory.addEntry(entry)
}
default:
close(c)
@ -200,16 +158,7 @@ Loop:
}
}
for k, v := range tmp {
chunks := util.Chunks(v.String(), k)
sort.Strings(chunks)
cache[k] = strings.Join(chunks, "")
v.Reset()
}
return cache
return factory.create()
}
// Match matches passed domain name against cached list entries
@ -218,7 +167,7 @@ func (b *ListCache) Match(domain string, groupsToCheck []string) (found bool, gr
defer b.lock.RUnlock()
for _, g := range groupsToCheck {
if b.groupCaches[g].contains(domain) {
if c, ok := b.groupCaches[g]; ok && c.contains(domain) {
return true, g
}
}

View File

@ -19,7 +19,6 @@ var _ = Describe("ListCache", func() {
emptyFile, file1, file2, file3 *os.File
server1, server2, server3 *httptest.Server
)
BeforeEach(func() {
emptyFile = TempFile("#empty file\n\n")
server1 = TestServer("blocked1.com\nblocked1a.com\n192.168.178.55")
@ -259,6 +258,23 @@ var _ = Describe("ListCache", func() {
Expect(group).Should(Equal("gr1"))
})
})
When("inline regex content is defined", func() {
It("should match", func() {
lists := map[string][]string{
"gr1": {"/^apple\\.(de|com)$/\n"},
}
sut := NewListCache(ListCacheTypeBlacklist, lists, 0, 0)
found, group := sut.Match("apple.com", []string{"gr1"})
Expect(found).Should(BeTrue())
Expect(group).Should(Equal("gr1"))
found, group = sut.Match("apple.de", []string{"gr1"})
Expect(found).Should(BeTrue())
Expect(group).Should(Equal("gr1"))
})
})
})
Describe("Configuration", func() {
When("refresh is enabled", func() {