regex support for matching (#12)

This commit is contained in:
Dimitri Herzog 2021-09-18 22:37:02 +02:00
parent 925c6f97eb
commit e7ddab714b
9 changed files with 305 additions and 62 deletions

View File

@ -25,6 +25,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo
* Definition of black and white lists per client group (Kids, Smart home devices, etc.) * Definition of black and white lists per client group (Kids, Smart home devices, etc.)
* Periodical reload of external black and white lists * Periodical reload of external black and white lists
* Regex support
* Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists) * Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists)
- **Advanced DNS configuration** - not just an ad-blocker - **Advanced DNS configuration** - not just an ad-blocker

View File

@ -53,6 +53,8 @@ blocking:
# inline definition with YAML literal block scalar style # inline definition with YAML literal block scalar style
# hosts format # hosts format
whitelistdomain.com whitelistdomain.com
# this is a regex
/^banners?[_.-]/
# definition: which groups should be applied for which client # definition: which groups should be applied for which client
clientGroupsBlock: clientGroupsBlock:
# default will be used, if no special definition for a client name exists # default will be used, if no special definition for a client name exists

View File

@ -176,7 +176,8 @@ contains a map of client name and multiple IP addresses.
Blocky can download and use external lists with domains or IP addresses to block DNS query (e.g. advertisement, malware, Blocky can download and use external lists with domains or IP addresses to block DNS query (e.g. advertisement, malware,
trackers, adult sites). You can group several list sources together and define the blocking behavior per client. trackers, adult sites). You can group several list sources together and define the blocking behavior per client.
External blacklists must be in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)). External blacklists must be either in the well-known [Hosts format](https://en.wikipedia.org/wiki/Hosts_(file)) or just
a plain domain list (one domain per line). Blocky also supports regex as a more powerful tool for defining patterns to block.
Blocky uses [DNS sinkhole](https://en.wikipedia.org/wiki/DNS_sinkhole) approach to block a DNS query. Domain name from Blocky uses [DNS sinkhole](https://en.wikipedia.org/wiki/DNS_sinkhole) approach to block a DNS query. Domain name from
the request, IP address from the response, and the CNAME record will be checked against configured blacklists. the request, IP address from the response, and the CNAME record will be checked against configured blacklists.
@ -200,6 +201,8 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a
# inline definition with YAML literal block scalar style # inline definition with YAML literal block scalar style
someadsdomain.com someadsdomain.com
anotheradsdomain.com anotheradsdomain.com
# this is a regex
/^banners?[_.-]/
special: special:
- https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts - https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts
whiteLists: whiteLists:
@ -218,6 +221,15 @@ in hosts format (YAML literal block scalar style). All Urls must be grouped to a
If a group has **only** whitelist entries -> this means only domains from this list are allowed, all other domains will If a group has **only** whitelist entries -> this means only domains from this list are allowed, all other domains will
be blocked be blocked
#### Regex support
You can use regex to define patterns to block. A regex entry must start and end with the slash character (/). Some
examples:
- `/baddomain/` will block `www.baddomain.com`, `baddomain.com`, but also `mybaddomain-sometext.com`
- `/^baddomain/` will block `baddomain.com`, but not `www.baddomain.com`
- `/^apple\.(de|com)$/` will only block `apple.de` and `apple.com`
### Client groups ### Client groups
In this configuration section, you can define, which blocking group(s) should be used for which client in your network. In this configuration section, you can define, which blocking group(s) should be used for which client in your network.

View File

@ -22,4 +22,5 @@
*[CSV]: Comma-separated values *[CSV]: Comma-separated values
*[SAMBA]: Server Message Block Protocol (Windows Network File System) *[SAMBA]: Server Message Block Protocol (Windows Network File System)
*[DHCP]: Dynamic Host Configuration Protocol *[DHCP]: Dynamic Host Configuration Protocol
*[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h". *[duration format]: Example: "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us", "ms", "s", "m", "h".
*[regex]: Regular expression

View File

@ -12,6 +12,7 @@ Blocky is a DNS proxy and ad-blocker for the local network written in Go with fo
* Definition of black and white lists per client group (Kids, Smart home devices, etc.) * Definition of black and white lists per client group (Kids, Smart home devices, etc.)
* Periodical reload of external black and white lists * Periodical reload of external black and white lists
* Regex support
* Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists) * Blocking of request domain, response CNAME (deep CNAME inspection) and response IP addresses (against IP lists)
- **Advanced DNS configuration** - :nerd: not just an ad-blocker - **Advanced DNS configuration** - :nerd: not just an ad-blocker

182
lists/caches.go Normal file
View File

@ -0,0 +1,182 @@
package lists
import (
"regexp"
"sort"
"strings"
"github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/util"
)
// cache is a read-only collection of list entries which can be queried for a
// search string. It is implemented in this file by stringCache, regexCache and
// chainedCache.
type cache interface {
// elementCount returns the number of entries held by the cache
elementCount() int
// contains reports whether the cache matches the passed search string
contains(searchString string) bool
}
// cacheFactory collects entries one by one and builds a cache out of them.
type cacheFactory interface {
// addEntry registers a single entry for the cache to be created
addEntry(entry string)
// create returns the cache built from the entries added so far
create() cache
}
// stringCache stores plain entries grouped by their byte length: the map key
// is the entry length, the value is the concatenation of all entries of that
// length. contains performs a binary search over the fixed-width slots of one
// bucket and therefore relies on each bucket being sorted in ascending order.
type stringCache map[int]string

// elementCount returns the total number of entries over all buckets.
func (cache stringCache) elementCount() int {
	count := 0

	for k, v := range cache {
		// a bucket keyed by a non-positive length cannot hold an entry;
		// skipping it also avoids a division by zero
		if k <= 0 {
			continue
		}

		count += len(v) / k
	}

	return count
}

// contains reports whether searchString is stored in the cache. The lookup is
// case-insensitive with respect to the search string: it is lower-cased once
// and the lower-cased form is used for the bucket selection, the binary search
// and the final comparison.
// NOTE(review): this presumes the stored entries are lower case already —
// confirm against the list loader.
func (cache stringCache) contains(searchString string) bool {
	normalized := strings.ToLower(searchString)

	searchLen := len(normalized)
	if searchLen == 0 {
		return false
	}

	bucket := cache[searchLen]
	searchBucketLen := len(bucket) / searchLen

	// first slot whose entry is not smaller than the search string
	idx := sort.Search(searchBucketLen, func(i int) bool {
		return bucket[i*searchLen:(i+1)*searchLen] >= normalized
	})

	return idx < searchBucketLen && bucket[idx*searchLen:(idx+1)*searchLen] == normalized
}
// stringCacheFactory collects plain string entries and builds a stringCache
// out of them.
type stringCacheFactory struct {
	// cache is the map filled and returned by create
	cache stringCache
	// keys remembers every entry added so far, used to drop duplicates
	keys map[string]struct{}
	// tmp accumulates the raw entries per entry length until create is called
	tmp map[int]*strings.Builder
}

// newStringCacheFactory returns a factory with empty, ready-to-use maps.
func newStringCacheFactory() cacheFactory {
	factory := new(stringCacheFactory)
	factory.cache = make(stringCache)
	// temporary map to remove duplicates
	factory.keys = make(map[string]struct{})
	factory.tmp = make(map[int]*strings.Builder)

	return factory
}

// addEntry appends entry to the builder of its length bucket. An entry which
// was added before is ignored.
func (s *stringCacheFactory) addEntry(entry string) {
	if _, seen := s.keys[entry]; seen {
		return
	}

	s.keys[entry] = struct{}{}

	size := len(entry)

	builder := s.tmp[size]
	if builder == nil {
		builder = &strings.Builder{}
		s.tmp[size] = builder
	}

	builder.WriteString(entry)
}

// create sorts the entries of every length bucket, stores them concatenated
// in the cache and resets the temporary builders.
func (s *stringCacheFactory) create() cache {
	for size, builder := range s.tmp {
		// presumably splits the accumulated string into pieces of `size` bytes,
		// i.e. back into the single entries — verify against util.Chunks
		entries := util.Chunks(builder.String(), size)
		sort.Strings(entries)

		s.cache[size] = strings.Join(entries, "")

		builder.Reset()
	}

	return s.cache
}
// regexCache is a list of compiled regular expressions.
type regexCache []*regexp.Regexp

// elementCount returns the number of expressions held by the cache.
func (cache regexCache) elementCount() int {
	count := len(cache)

	return count
}
// contains reports whether at least one expression of the cache matches
// searchString. The expressions are tried in order; the first match is
// logged on debug level and ends the search.
func (cache regexCache) contains(searchString string) bool {
	for i := range cache {
		pattern := cache[i]
		if !pattern.MatchString(searchString) {
			continue
		}

		log.PrefixedLog("regexCache").Debugf("regex '%s' matched with '%s'", pattern, searchString)

		return true
	}

	return false
}
// regexCacheFactory compiles regex entries and collects the compiled
// expressions into a regexCache.
type regexCacheFactory struct {
	cache regexCache
}

// addEntry compiles entry as a regular expression and stores the result.
// Entries are skipped with a warning instead of being stored when
//   - the pattern is empty: it would compile fine but match every string,
//     which turns a single stray list line into a match-all rule
//   - the pattern can't be compiled; the warning carries the compile error so
//     the list author can see what is wrong
func (r *regexCacheFactory) addEntry(entry string) {
	if entry == "" {
		log.Log().Warnf("ignoring empty regex")

		return
	}

	compiled, err := regexp.Compile(entry)
	if err != nil {
		log.Log().Warnf("invalid regex '%s': %v", entry, err)

		return
	}

	r.cache = append(r.cache, compiled)
}

// create returns the cache with all expressions compiled so far.
func (r *regexCacheFactory) create() cache {
	return r.cache
}

// newRegexCacheFactory returns a factory with an empty regex list.
func newRegexCacheFactory() cacheFactory {
	return &regexCacheFactory{
		cache: make(regexCache, 0),
	}
}
// chainedCache combines several caches and presents them as a single one.
type chainedCache struct {
	caches []cache
}

// elementCount returns the sum of the element counts of all chained caches.
func (cache chainedCache) elementCount() int {
	total := 0

	for i := range cache.caches {
		total += cache.caches[i].elementCount()
	}

	return total
}

// contains asks the chained caches in order and reports true as soon as one
// of them contains searchString.
func (cache chainedCache) contains(searchString string) bool {
	for i := range cache.caches {
		if cache.caches[i].contains(searchString) {
			return true
		}
	}

	return false
}
// chainedCacheFactory dispatches every entry either to a factory for plain
// strings or to a factory for regular expressions and builds a chainedCache
// from the results of both.
type chainedCacheFactory struct {
	stringCacheFactory cacheFactory
	regexCacheFactory  cacheFactory
}

// regexPattern recognizes entries which start and end with a slash; such
// entries are handled as regular expressions.
var regexPattern = regexp.MustCompile("^/.*/$")

// addEntry hands entry over to the regex factory if it is wrapped in slashes,
// otherwise to the string factory. For a regex entry all leading and trailing
// slashes and then the surrounding white space are removed before the pattern
// is passed on.
func (r *chainedCacheFactory) addEntry(entry string) {
	if !regexPattern.MatchString(entry) {
		r.stringCacheFactory.addEntry(entry)

		return
	}

	pattern := strings.Trim(entry, "/")
	pattern = strings.TrimSpace(pattern)

	r.regexCacheFactory.addEntry(pattern)
}

// create builds both underlying caches and chains them: the string cache is
// consulted first, the regex cache second.
func (r *chainedCacheFactory) create() cache {
	plain := r.stringCacheFactory.create()
	regex := r.regexCacheFactory.create()

	return &chainedCache{caches: []cache{plain, regex}}
}

// newChainedCacheFactory returns a factory backed by a fresh string factory
// and a fresh regex factory.
func newChainedCacheFactory() cacheFactory {
	factory := &chainedCacheFactory{}
	factory.stringCacheFactory = newStringCacheFactory()
	factory.regexCacheFactory = newRegexCacheFactory()

	return factory
}

79
lists/caches_test.go Normal file
View File

@ -0,0 +1,79 @@
package lists
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Specs for the cache implementations of this package and their factories:
// every section fills a factory, creates the cache and checks lookups and the
// reported element count.
var _ = Describe("Caches", func() {
Describe("String cache", func() {
When("string cache was created", func() {
factory := newStringCacheFactory()
factory.addEntry("google.com")
factory.addEntry("apple.com")
cache := factory.create()
It("should match if cache contains string", func() {
Expect(cache.contains("apple.com")).Should(BeTrue())
Expect(cache.contains("google.com")).Should(BeTrue())
// a plain entry only matches on equality, sub domains are not covered
Expect(cache.contains("www.google.com")).Should(BeFalse())
})
It("should return correct element count", func() {
Expect(cache.elementCount()).Should(Equal(2))
})
})
})
Describe("Regex cache", func() {
When("regex cache was created", func() {
factory := newRegexCacheFactory()
// not anchored: matches wherever "google" + any char + "com" occurs
factory.addEntry(".*google.com")
// anchored on both sides: matches exactly apple.de and apple.com
factory.addEntry("^apple\\.(de|com)$")
factory.addEntry("amazon")
// this is not a regex, will be ignored
factory.addEntry("(wrongRegex")
cache := factory.create()
It("should match if one regex in cache matches string", func() {
Expect(cache.contains("google.com")).Should(BeTrue())
Expect(cache.contains("google.coma")).Should(BeTrue())
Expect(cache.contains("agoogle.com")).Should(BeTrue())
Expect(cache.contains("www.google.com")).Should(BeTrue())
Expect(cache.contains("apple.com")).Should(BeTrue())
Expect(cache.contains("apple.de")).Should(BeTrue())
Expect(cache.contains("apple.it")).Should(BeFalse())
Expect(cache.contains("www.apple.com")).Should(BeFalse())
Expect(cache.contains("applecom")).Should(BeFalse())
Expect(cache.contains("www.amazon.com")).Should(BeTrue())
Expect(cache.contains("amazon.com")).Should(BeTrue())
Expect(cache.contains("myamazon.com")).Should(BeTrue())
})
It("should return correct element count", func() {
// four entries were added, the invalid one was dropped
Expect(cache.elementCount()).Should(Equal(3))
})
})
})
Describe("Chained cache", func() {
When("chained cache was created", func() {
factory := newChainedCacheFactory()
// entries wrapped in slashes are handled as regex, all others as plain strings
factory.addEntry("/.*google.com/")
factory.addEntry("/^apple\\.(de|com)$/")
factory.addEntry("amazon.com")
cache := factory.create()
It("should match if one regex in cache matches string", func() {
Expect(cache.contains("google.com")).Should(BeTrue())
Expect(cache.contains("google.coma")).Should(BeTrue())
Expect(cache.contains("agoogle.com")).Should(BeTrue())
Expect(cache.contains("www.google.com")).Should(BeTrue())
Expect(cache.contains("apple.com")).Should(BeTrue())
Expect(cache.contains("amazon.com")).Should(BeTrue())
Expect(cache.contains("apple.de")).Should(BeTrue())
Expect(cache.contains("www.apple.com")).Should(BeFalse())
Expect(cache.contains("applecom")).Should(BeFalse())
})
It("should return correct element count", func() {
// two regex entries plus one plain string entry
Expect(cache.elementCount()).Should(Equal(3))
})
})
})
})

View File

@ -9,15 +9,12 @@ import (
"net" "net"
"net/http" "net/http"
"os" "os"
"sort"
"strings" "strings"
"sync" "sync"
"time" "time"
"github.com/0xERR0R/blocky/evt" "github.com/0xERR0R/blocky/evt"
"github.com/0xERR0R/blocky/log" "github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/util"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
) )
@ -32,36 +29,6 @@ const (
// ) // )
type ListCacheType int type ListCacheType int
type stringCache map[int]string
func (cache stringCache) elementCount() int {
count := 0
for k, v := range cache {
count += len(v) / k
}
return count
}
func (cache stringCache) contains(searchString string) bool {
searchLen := len(searchString)
if searchLen == 0 {
return false
}
searchBucketLen := len(cache[searchLen]) / searchLen
idx := sort.Search(searchBucketLen, func(i int) bool {
return cache[searchLen][i*searchLen:i*searchLen+searchLen] >= searchString
})
if idx < searchBucketLen {
return cache[searchLen][idx*searchLen:idx*searchLen+searchLen] == strings.ToLower(searchString)
}
return false
}
// Matcher checks if a domain is in a list // Matcher checks if a domain is in a list
type Matcher interface { type Matcher interface {
// Match matches passed domain name against cached list entries // Match matches passed domain name against cached list entries
@ -73,7 +40,7 @@ type Matcher interface {
// ListCache generic cache of strings divided in groups // ListCache generic cache of strings divided in groups
type ListCache struct { type ListCache struct {
groupCaches map[string]stringCache groupCaches map[string]cache
lock sync.RWMutex lock sync.RWMutex
groupToLinks map[string][]string groupToLinks map[string][]string
@ -115,7 +82,7 @@ func (b *ListCache) Configuration() (result []string) {
// NewListCache creates new list instance // NewListCache creates new list instance
func NewListCache(t ListCacheType, groupToLinks map[string][]string, refreshPeriod time.Duration, func NewListCache(t ListCacheType, groupToLinks map[string][]string, refreshPeriod time.Duration,
downloadTimeout time.Duration) *ListCache { downloadTimeout time.Duration) *ListCache {
groupCaches := make(map[string]stringCache) groupCaches := make(map[string]cache)
timeout := downloadTimeout timeout := downloadTimeout
if downloadTimeout == 0 { if downloadTimeout == 0 {
@ -159,15 +126,12 @@ func logger() *logrus.Entry {
} }
// downloads and reads files with domain names and creates cache for them // downloads and reads files with domain names and creates cache for them
func (b *ListCache) createCacheForGroup(links []string) stringCache { func (b *ListCache) createCacheForGroup(links []string) cache {
cache := make(stringCache)
keys := make(map[string]struct{})
var wg sync.WaitGroup var wg sync.WaitGroup
c := make(chan []string, len(links)) c := make(chan []string, len(links))
// loop over links (http/local) or inline definitions
for _, link := range links { for _, link := range links {
wg.Add(1) wg.Add(1)
@ -176,7 +140,7 @@ func (b *ListCache) createCacheForGroup(links []string) stringCache {
wg.Wait() wg.Wait()
tmp := make(map[int]*strings.Builder) factory := newChainedCacheFactory()
Loop: Loop:
for { for {
@ -186,13 +150,7 @@ Loop:
return nil return nil
} }
for _, entry := range res { for _, entry := range res {
if _, value := keys[entry]; !value { factory.addEntry(entry)
keys[entry] = struct{}{}
if tmp[len(entry)] == nil {
tmp[len(entry)] = &strings.Builder{}
}
tmp[len(entry)].WriteString(entry)
}
} }
default: default:
close(c) close(c)
@ -200,16 +158,7 @@ Loop:
} }
} }
for k, v := range tmp { return factory.create()
chunks := util.Chunks(v.String(), k)
sort.Strings(chunks)
cache[k] = strings.Join(chunks, "")
v.Reset()
}
return cache
} }
// Match matches passed domain name against cached list entries // Match matches passed domain name against cached list entries
@ -218,7 +167,7 @@ func (b *ListCache) Match(domain string, groupsToCheck []string) (found bool, gr
defer b.lock.RUnlock() defer b.lock.RUnlock()
for _, g := range groupsToCheck { for _, g := range groupsToCheck {
if b.groupCaches[g].contains(domain) { if c, ok := b.groupCaches[g]; ok && c.contains(domain) {
return true, g return true, g
} }
} }

View File

@ -19,7 +19,6 @@ var _ = Describe("ListCache", func() {
emptyFile, file1, file2, file3 *os.File emptyFile, file1, file2, file3 *os.File
server1, server2, server3 *httptest.Server server1, server2, server3 *httptest.Server
) )
BeforeEach(func() { BeforeEach(func() {
emptyFile = TempFile("#empty file\n\n") emptyFile = TempFile("#empty file\n\n")
server1 = TestServer("blocked1.com\nblocked1a.com\n192.168.178.55") server1 = TestServer("blocked1.com\nblocked1a.com\n192.168.178.55")
@ -259,6 +258,23 @@ var _ = Describe("ListCache", func() {
Expect(group).Should(Equal("gr1")) Expect(group).Should(Equal("gr1"))
}) })
}) })
When("inline regex content is defined", func() {
It("should match", func() {
lists := map[string][]string{
"gr1": {"/^apple\\.(de|com)$/\n"},
}
sut := NewListCache(ListCacheTypeBlacklist, lists, 0, 0)
found, group := sut.Match("apple.com", []string{"gr1"})
Expect(found).Should(BeTrue())
Expect(group).Should(Equal("gr1"))
found, group = sut.Match("apple.de", []string{"gr1"})
Expect(found).Should(BeTrue())
Expect(group).Should(Equal("gr1"))
})
})
}) })
Describe("Configuration", func() { Describe("Configuration", func() {
When("refresh is enabled", func() { When("refresh is enabled", func() {