From 6d97f8b4582414b6ce69467656824690057d4793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Mon, 11 Mar 2024 22:10:47 -0700 Subject: [PATCH] Parse podcast categories --- internal/reader/itunes/itunes.go | 11 +++ internal/reader/rss/parser_test.go | 110 ++++++++++++++++++++++++----- internal/reader/rss/rss.go | 32 +++------ 3 files changed, 113 insertions(+), 40 deletions(-) diff --git a/internal/reader/itunes/itunes.go b/internal/reader/itunes/itunes.go index 0382493f..1673f306 100644 --- a/internal/reader/itunes/itunes.go +++ b/internal/reader/itunes/itunes.go @@ -22,6 +22,17 @@ type ItunesFeedElement struct { ItunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"` } +func (i *ItunesFeedElement) GetItunesCategories() []string { + var categories []string + for _, category := range i.ItunesCategories { + categories = append(categories, category.Text) + if category.SubCategory != nil { + categories = append(categories, category.SubCategory.Text) + } + } + return categories +} + type ItunesItemElement struct { ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` ItunesEpisode string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"` diff --git a/internal/reader/rss/parser_test.go b/internal/reader/rss/parser_test.go index a8fbc76f..e4ff09ed 100644 --- a/internal/reader/rss/parser_test.go +++ b/internal/reader/rss/parser_test.go @@ -1434,18 +1434,17 @@ func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) { } } -func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) { +func TestParseFeedWithCategories(t *testing.T) { data := ` Example https://example.org/ - + Category 1 + Test https://example.org/item - Category 1 - Category 2 ` @@ -1459,27 +1458,99 @@ func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) { t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) } - expected := "Category 2" - result := feed.Entries[0].Tags[1] - if result != expected { - t.Errorf("Incorrect entry category, got %q instead of %q", result, expected) + expected := []string{"Category 1", "Category 2"} + result := feed.Entries[0].Tags + + for i, tag := range result { + if tag != expected[i] { + t.Errorf("Incorrect tag, got: %q", tag) + } } } -func TestParseEntryWithCategoryAndCDATA(t *testing.T) { +func TestParseEntryWithCategories(t *testing.T) { data := ` Example https://example.org/ - + Category 3 + + Test + https://example.org/item + Category 1 + + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries[0].Tags) != 3 { + t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) + } + + expected := []string{"Category 1", "Category 2", "Category 3"} + result := feed.Entries[0].Tags + + for i, tag := range result { + if tag != expected[i] { + t.Errorf("Incorrect tag, got: %q", tag) + } + } +} + +func TestParseFeedWithItunesCategories(t *testing.T) { + data := ` + + + Example + https://example.org/ + + + + + + + + Test + https://example.org/item + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries[0].Tags) != 4 { + t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) + } + + expected := []string{"Society & Culture", "Documentary", "Health", "Mental Health"} + result := feed.Entries[0].Tags + + for i, tag := range result { + if tag != expected[i] { + t.Errorf("Incorrect tag, got: %q", tag) + } + } +} + +func TestParseFeedWithGooglePlayCategory(t *testing.T) { + data := ` + + + Example + https://example.org/ + Test https://example.org/item - - by - - Sample Category ` @@ -1493,10 +1564,13 @@ func TestParseEntryWithCategoryAndCDATA(t *testing.T) { t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags)) } - expected := "Sample Category" - result := feed.Entries[0].Tags[0] - if result != expected { - t.Errorf("Incorrect entry category, got %q instead of %q", result, expected) + expected := []string{"Art"} + result := feed.Entries[0].Tags + + for i, tag := range result { + if tag != expected[i] { + t.Errorf("Incorrect tag, got: %q", tag) + } } } diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index cd1442bd..be53c4b0 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -31,6 +31,7 @@ type rssFeed struct { } type rssChannel struct { + Categories []string `xml:"rss category"` Title string `xml:"rss title"` Link string `xml:"rss link"` ImageURL string `xml:"rss image>url"` @@ -111,6 +112,13 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { entry.Title = entry.URL } + entry.Tags = append(entry.Tags, r.Channel.Categories...) + entry.Tags = append(entry.Tags, r.Channel.GetItunesCategories()...) + + if r.Channel.GooglePlayCategory.Text != "" { + entry.Tags = append(entry.Tags, r.Channel.GooglePlayCategory.Text) + } + feed.Entries = append(feed.Entries, entry) } @@ -165,12 +173,6 @@ type rssEnclosure struct { Length string `xml:"length,attr"` } -type rssCategory struct { - XMLName xml.Name - Data string `xml:",chardata"` - Inner string `xml:",innerxml"` -} - func (enclosure *rssEnclosure) Size() int64 { if enclosure.Length == "" { return 0 @@ -188,7 +190,7 @@ type rssItem struct { Author rssAuthor `xml:"rss author"` Comments string `xml:"rss comments"` EnclosureLinks []rssEnclosure `xml:"rss enclosure"` - Categories []rssCategory `xml:"rss category"` + Categories []string `xml:"rss category"` dublincore.DublinCoreItemElement FeedBurnerElement media.Element @@ -208,7 +210,7 @@ func (r *rssItem) Transform() *model.Entry { entry.Content = r.entryContent() entry.Title = r.entryTitle() entry.Enclosures = r.entryEnclosures() - entry.Tags = r.entryCategories() + entry.Tags = r.Categories if duration, err := normalizeDuration(r.ItunesDuration); err == nil { entry.ReadingTime = duration } @@ -383,20 +385,6 @@ func (r *rssItem) entryEnclosures() model.EnclosureList { return enclosures } -func (r *rssItem) entryCategories() []string { - categoryList := make([]string, 0) - - for _, rssCategory := range r.Categories { - if strings.Contains(rssCategory.Inner, "