From 4834e934f2cf57b106923bd37d62d6c5f6f39f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Fri, 15 Mar 2024 18:04:24 -0700 Subject: [PATCH] Remove some duplicated code in RSS parser --- internal/reader/rss/adapter.go | 10 +- internal/reader/rss/atom.go | 37 ++------ internal/reader/rss/parser_test.go | 100 ++++++++++++++++++++ internal/reader/rss/rss.go | 144 +++++++++++++++++++++++------ 4 files changed, 227 insertions(+), 64 deletions(-) diff --git a/internal/reader/rss/adapter.go b/internal/reader/rss/adapter.go index 2909fc6b..531cc53f 100644 --- a/internal/reader/rss/adapter.go +++ b/internal/reader/rss/adapter.go @@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed { // Try to find the feed URL from the Atom links. for _, atomLink := range r.rss.Channel.AtomLinks.Links { - atomLinkHref := strings.TrimSpace(atomLink.URL) + atomLinkHref := strings.TrimSpace(atomLink.Href) if atomLinkHref != "" && atomLink.Rel == "self" { if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil { feed.FeedURL = absoluteFeedURL @@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string { } for _, atomLink := range rssItem.AtomLinks.Links { - if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { - return strings.TrimSpace(atomLink.URL) + if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { + return strings.TrimSpace(atomLink.Href) } } @@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string { author = rssItem.ItunesAuthor case rssItem.DublinCoreCreator != "": author = rssItem.DublinCoreCreator - case rssItem.AtomAuthor.String() != "": - author = rssItem.AtomAuthor.String() + case rssItem.AtomAuthor.PersonName() != "": + author = rssItem.AtomAuthor.PersonName() case strings.Contains(rssItem.Author.Inner, " + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + this is <b>bold</b> + + 
+ ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `this is bold` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + bold]]> + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `this is bold` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + 5 &lt; 8, ticker symbol &lt;BIGCO&gt; + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `5 < 8, ticker symbol <BIGCO>` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + +// https://www.rssboard.org/rss-encoding-examples +func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) { + data := ` + + + Example + http://example.org/ + + Item 1 + http://example.org/item1 + + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Content != `5 < 8, ticker symbol <BIGCO>` { + t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content) + } +} + func TestParseEntryWithFeedBurnerLink(t *testing.T) { data := ` diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 7935166d..bc99b461 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -16,29 +16,75 @@ 
import ( // Specs: https://www.rssboard.org/rss-specification type RSS struct { - Version string `xml:"rss version,attr"` + // Version is the version of the RSS specification. + Version string `xml:"rss version,attr"` + + // Channel is the main container for the RSS feed. Channel RSSChannel `xml:"rss channel"` } type RSSChannel struct { - Title string `xml:"rss title"` - Link string `xml:"rss link"` - Description string `xml:"rss description"` - Language string `xml:"rss language"` - Copyright string `xml:"rss copyRight"` - ManagingEditor string `xml:"rss managingEditor"` - Webmaster string `xml:"rss webMaster"` - PubDate string `xml:"rss pubDate"` - LastBuildDate string `xml:"rss lastBuildDate"` - Categories []string `xml:"rss category"` - Generator string `xml:"rss generator"` - Docs string `xml:"rss docs"` - Cloud *RSSCloud `xml:"rss cloud"` - Image *RSSImage `xml:"rss image"` - TTL string `xml:"rss ttl"` - SkipHours []string `xml:"rss skipHours>hour"` - SkipDays []string `xml:"rss skipDays>day"` - Items []RSSItem `xml:"rss item"` + // Title is the name of the channel. + Title string `xml:"rss title"` + + // Link is the URL to the HTML website corresponding to the channel. + Link string `xml:"rss link"` + + // Description is a phrase or sentence describing the channel. + Description string `xml:"rss description"` + + // Language is the language the channel is written in. + // A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes. + // You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes. + Language string `xml:"rss language"` + + // Copyright is a string indicating the copyright. + Copyright string `xml:"rss copyRight"` + + // ManagingEditor is the email address for the person responsible for editorial content. 
+ ManagingEditor string `xml:"rss managingEditor"` + + // Webmaster is the email address for the person responsible for technical issues relating to the channel. + Webmaster string `xml:"rss webMaster"` + + // PubDate is the publication date for the content in the channel. + // All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred). + PubDate string `xml:"rss pubDate"` + + // LastBuildDate is the last time the content of the channel changed. + LastBuildDate string `xml:"rss lastBuildDate"` + + // Categories is a collection of categories to which the channel belongs. + Categories []string `xml:"rss category"` + + // Generator is a string indicating the program used to generate the channel. + Generator string `xml:"rss generator"` + + // DocumentationURL is a URL that points to the documentation for the format used in the RSS file. + DocumentationURL string `xml:"rss docs"` + + // Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1. + Cloud *RSSCloud `xml:"rss cloud"` + + // Image specifies a GIF, JPEG or PNG image that can be displayed with the channel. + Image *RSSImage `xml:"rss image"` + + // TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source. + TTL string `xml:"rss ttl"` + + // SkipHours is a hint for aggregators telling them which hours they can skip. + // An XML element that contains up to 24 sub-elements whose value is a number between 0 and 23, + // representing a time in GMT, when aggregators, + // if they support the feature, may not read the channel on hours listed in the skipHours element. + SkipHours []string `xml:"rss skipHours>hour"` + + // SkipDays is a hint for aggregators telling them which days they can skip. 
+ // An XML element that contains up to seven sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday. + SkipDays []string `xml:"rss skipDays>day"` + + // Items is a collection of items. + Items []RSSItem `xml:"rss item"` + AtomLinks itunes.ItunesChannelElement googleplay.GooglePlayChannelElement @@ -64,16 +110,56 @@ type RSSImage struct { } type RSSItem struct { - Title string `xml:"rss title"` - Link string `xml:"rss link"` - Description string `xml:"rss description"` - Author RSSAuthor `xml:"rss author"` - Categories []string `xml:"rss category"` - CommentsURL string `xml:"rss comments"` - Enclosures []RSSEnclosure `xml:"rss enclosure"` - GUID RSSGUID `xml:"rss guid"` - PubDate string `xml:"rss pubDate"` - Source RSSSource `xml:"rss source"` + // Title is the title of the item. + Title string `xml:"rss title"` + + // Link is the URL of the item. + Link string `xml:"rss link"` + + // Description is the item synopsis. + Description string `xml:"rss description"` + + // Author is the email address of the author of the item. + Author RSSAuthor `xml:"rss author"` + + // <category> is an optional sub-element of <item>. + // It has one optional attribute, domain, a string that identifies a categorization taxonomy. + Categories []string `xml:"rss category"` + + // <comments> is an optional sub-element of <item>. + // If present, it contains the URL of the comments page for the item. + CommentsURL string `xml:"rss comments"` + + // <enclosure> is an optional sub-element of <item>. + // It has three required attributes. url says where the enclosure is located, + // length says how big it is in bytes, and type says what its type is, a standard MIME type. + Enclosures []RSSEnclosure `xml:"rss enclosure"` + + // <guid> is an optional sub-element of <item>. + // It's a string that uniquely identifies the item. + // When present, an aggregator may choose to use this string to determine if an item is new. + // + // There are no rules for the syntax of a guid. + // Aggregators must view them as a string. 
+ // It's up to the source of the feed to establish the uniqueness of the string. + // + // If the guid element has an attribute named isPermaLink with a value of true, + // the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser, + // that points to the full item described by the <item> element. + // + // isPermaLink is optional, its default value is true. + // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular. + GUID RSSGUID `xml:"rss guid"` + + // <pubDate> is the publication date of the item. + // Its value is a string in RFC 822 format. + PubDate string `xml:"rss pubDate"` + + // <source> is an optional sub-element of <item>. + // Its value is the name of the RSS channel that the item came from, derived from its <title>. + // It has one required attribute, url, which contains the URL of the RSS channel. + Source RSSSource `xml:"rss source"` + dublincore.DublinCoreItemElement FeedBurnerItemElement media.MediaItemElement