Deduplicate feed URLs when parsing HTML document during discovery process

Fixes #2232
This commit is contained in:
Frédéric Guillot 2023-12-01 13:35:24 -08:00
parent bfa83cbf99
commit 5de0714256
2 changed files with 41 additions and 2 deletions

View File

@ -152,6 +152,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
}
var subscriptions Subscriptions
subscriptionURLs := make(map[string]bool)
for query, kind := range queries {
doc.Find(query).Each(func(i int, s *goquery.Selection) {
subscription := new(Subscription)
@ -163,7 +164,10 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
if feedURL, exists := s.Attr("href"); exists {
if feedURL != "" {
subscription.URL, _ = urllib.AbsoluteURL(websiteURL, feedURL)
subscription.URL, err = urllib.AbsoluteURL(websiteURL, feedURL)
if err != nil {
return
}
}
}
@ -171,7 +175,8 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
subscription.Title = subscription.URL
}
if subscription.URL != "" {
if subscription.URL != "" && !subscriptionURLs[subscription.URL] {
subscriptionURLs[subscription.URL] = true
subscriptions = append(subscriptions, subscription)
}
})

View File

@ -249,6 +249,40 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
}
}
// TestParseWebPageWithDuplicatedFeeds checks that two <link> elements sharing
// the same href yield exactly one subscription, and that the returned entry
// carries the title, URL and type of the first element ("Feed A").
func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
	// Both links point at the same feed URL; only their titles differ.
	const page = `
<!doctype html>
<html>
<head>
<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed A">
<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed B">
</head>
<body>
</body>
</html>`

	finder := NewSubscriptionFinder(nil)
	found, err := finder.FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(page))
	if err != nil {
		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
	}

	// Stop here on a count mismatch: the checks below index the first element.
	if len(found) != 1 {
		t.Fatal(`Incorrect number of subscriptions returned`)
	}

	first := found[0]

	if first.Title != "Feed A" {
		t.Errorf(`Incorrect subscription title: %q`, first.Title)
	}

	if first.URL != "http://example.org/feed.xml" {
		t.Errorf(`Incorrect subscription URL: %q`, first.URL)
	}

	if first.Type != "rss" {
		t.Errorf(`Incorrect subscription type: %q`, first.Type)
	}
}
func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
htmlPage := `
<!doctype html>