Remove some duplicated code in RSS parser

2024-03-15 18:04:24 -07:00 · 2024-03-15 18:04:24 -07:00 · 4834e934f2
parent dd4fb660c1
commit 4834e934f2
4 changed files with 227 additions and 64 deletions
--- a/internal/reader/rss/adapter.go
+++ b/internal/reader/rss/adapter.go
@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
 	// Try to find the feed URL from the Atom links.
 	for _, atomLink := range r.rss.Channel.AtomLinks.Links {
-		atomLinkHref := strings.TrimSpace(atomLink.URL)
+		atomLinkHref := strings.TrimSpace(atomLink.Href)
 		if atomLinkHref != "" && atomLink.Rel == "self" {
 			if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
 				feed.FeedURL = absoluteFeedURL
@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string {
 	}
 	for _, atomLink := range rssItem.AtomLinks.Links {
-		if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
+		if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
-			return strings.TrimSpace(atomLink.URL)
+			return strings.TrimSpace(atomLink.Href)
 		}
 	}
@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string {
 		author = rssItem.ItunesAuthor
 	case rssItem.DublinCoreCreator != "":
 		author = rssItem.DublinCoreCreator
-	case rssItem.AtomAuthor.String() != "":
+	case rssItem.AtomAuthor.PersonName() != "":
-		author = rssItem.AtomAuthor.String()
+		author = rssItem.AtomAuthor.PersonName()
 	case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
 		author = rssItem.Author.Data
 	default:
--- a/internal/reader/rss/atom.go
+++ b/internal/reader/rss/atom.go
@ -3,41 +3,18 @@
 package rss // import "miniflux.app/v2/internal/reader/rss"
-import "strings"
+import (
 	"miniflux.app/v2/internal/reader/atom"
 )
 type AtomAuthor struct {
-	Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
+	Author atom.AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
 }
-func (a *AtomAuthor) String() string {
+func (a *AtomAuthor) PersonName() string {
-	return a.Author.String()
+	return a.Author.PersonName()
 }
 // AtomPerson describes a person (for example an Atom author) by name and email.
 type AtomPerson struct {
 	// Name is decoded from the <name> child element.
 	Name string `xml:"name"`
 	// Email is decoded from the <email> child element.
 	Email string `xml:"email"`
 }
 // String returns the person's name with surrounding whitespace removed.
 // When the name is empty or contains only whitespace, it falls back to the
 // trimmed email address. It returns an empty string when both are blank.
 func (a *AtomPerson) String() string {
 	// Trim before testing for emptiness so that a whitespace-only name does
 	// not shadow a usable email address.
 	if name := strings.TrimSpace(a.Name); name != "" {
 		return name
 	}
 	return strings.TrimSpace(a.Email)
 }
 // AtomLink holds the attributes decoded from a link element.
 type AtomLink struct {
 	// URL is read from the "href" attribute.
 	URL    string `xml:"href,attr"`
 	// Type is read from the "type" attribute.
 	Type   string `xml:"type,attr"`
 	// Rel is read from the "rel" attribute.
 	Rel    string `xml:"rel,attr"`
 	// Length is read from the "length" attribute and kept as the raw string.
 	Length string `xml:"length,attr"`
 }
 type AtomLinks struct {
-	Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
+	Links []*atom.AtomLink `xml:"http://www.w3.org/2005/Atom link"`
 }
--- a/internal/reader/rss/parser_test.go
+++ b/internal/reader/rss/parser_test.go
@ -746,6 +746,106 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
 	}
 }
 // TestParseEntryDescriptionWithEncodedHTMLTags checks that entity-encoded HTML
 // tags inside <description> come back as literal markup in the entry content.
 // Example taken from https://www.rssboard.org/rss-encoding-examples
 func TestParseEntryDescriptionWithEncodedHTMLTags(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example</title>
 			<link>http://example.org/</link>
 			<item>
 				<title>Item 1</title>
 				<link>http://example.org/item1</link>
 				<description>this is &lt;b&gt;bold&lt;/b&gt;</description>
 			</item>
 		</channel>
 	</rss>`
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Stop with a clear message rather than panicking on Entries[0] when the
 	// single item in the fixture was not parsed.
 	if len(feed.Entries) != 1 {
 		t.Fatalf("Unexpected number of entries, got %d", len(feed.Entries))
 	}
 	if feed.Entries[0].Content != `this is <b>bold</b>` {
 		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
 	}
 }
 // TestParseEntryWithDescriptionWithHTMLCDATA checks that HTML wrapped in a
 // CDATA section inside <description> is returned unchanged as the entry content.
 // Example taken from https://www.rssboard.org/rss-encoding-examples
 func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example</title>
 			<link>http://example.org/</link>
 			<item>
 				<title>Item 1</title>
 				<link>http://example.org/item1</link>
 				<description><![CDATA[this is <b>bold</b>]]></description>
 			</item>
 		</channel>
 	</rss>`
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Stop with a clear message rather than panicking on Entries[0] when the
 	// single item in the fixture was not parsed.
 	if len(feed.Entries) != 1 {
 		t.Fatalf("Unexpected number of entries, got %d", len(feed.Entries))
 	}
 	if feed.Entries[0].Content != `this is <b>bold</b>` {
 		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
 	}
 }
 // TestParseEntryDescriptionWithEncodingAngleBracketsInText checks that
 // double-encoded angle brackets (&amp;lt; / &amp;gt;) inside <description>
 // are decoded once, leaving &lt; / &gt; entities in the entry content.
 // Example taken from https://www.rssboard.org/rss-encoding-examples
 func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example</title>
 			<link>http://example.org/</link>
 			<item>
 				<title>Item 1</title>
 				<link>http://example.org/item1</link>
 				<description>5 &amp;lt; 8, ticker symbol &amp;lt;BIGCO&amp;gt;</description>
 			</item>
 		</channel>
 	</rss>`
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Stop with a clear message rather than panicking on Entries[0] when the
 	// single item in the fixture was not parsed.
 	if len(feed.Entries) != 1 {
 		t.Fatalf("Unexpected number of entries, got %d", len(feed.Entries))
 	}
 	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
 		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
 	}
 }
 // TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection checks
 // that &lt; / &gt; entities placed inside a CDATA section in <description>
 // are returned unchanged in the entry content.
 // Example taken from https://www.rssboard.org/rss-encoding-examples
 func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example</title>
 			<link>http://example.org/</link>
 			<item>
 				<title>Item 1</title>
 				<link>http://example.org/item1</link>
 				<description><![CDATA[5 &lt; 8, ticker symbol &lt;BIGCO&gt;]]></description>
 			</item>
 		</channel>
 	</rss>`
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Stop with a clear message rather than panicking on Entries[0] when the
 	// single item in the fixture was not parsed.
 	if len(feed.Entries) != 1 {
 		t.Fatalf("Unexpected number of entries, got %d", len(feed.Entries))
 	}
 	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
 		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
 	}
 }
 func TestParseEntryWithFeedBurnerLink(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">
--- a/internal/reader/rss/rss.go
+++ b/internal/reader/rss/rss.go
@ -16,29 +16,75 @@ import (
 // Specs: https://www.rssboard.org/rss-specification
 type RSS struct {
-	Version string     `xml:"rss version,attr"`
+	// Version is the version of the RSS specification.
 	Version string `xml:"rss version,attr"`
 	// Channel is the main container for the RSS feed.
 	Channel RSSChannel `xml:"rss channel"`
 }
 type RSSChannel struct {
-	Title          string    `xml:"rss title"`
+	// Title is the name of the channel.
-	Link           string    `xml:"rss link"`
+	Title string `xml:"rss title"`
-	Description    string    `xml:"rss description"`
+
-	Language       string    `xml:"rss language"`
+	// Link is the URL to the HTML website corresponding to the channel.
-	Copyright      string    `xml:"rss copyRight"`
+	Link string `xml:"rss link"`
-	ManagingEditor string    `xml:"rss managingEditor"`
+
-	Webmaster      string    `xml:"rss webMaster"`
+	// Description is a phrase or sentence describing the channel.
-	PubDate        string    `xml:"rss pubDate"`
+	Description string `xml:"rss description"`
-	LastBuildDate  string    `xml:"rss lastBuildDate"`
+
-	Categories     []string  `xml:"rss category"`
+	// Language is the language the channel is written in.
-	Generator      string    `xml:"rss generator"`
+	// A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes.
-	Docs           string    `xml:"rss docs"`
+	// You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes.
-	Cloud          *RSSCloud `xml:"rss cloud"`
+	Language string `xml:"rss language"`
-	Image          *RSSImage `xml:"rss image"`
+
-	TTL            string    `xml:"rss ttl"`
+	// Copyright is a string indicating the copyright.
-	SkipHours      []string  `xml:"rss skipHours>hour"`
+	Copyright string `xml:"rss copyRight"`
-	SkipDays       []string  `xml:"rss skipDays>day"`
+
-	Items          []RSSItem `xml:"rss item"`
+	// ManagingEditor is the email address for the person responsible for editorial content.
 	ManagingEditor string `xml:"rss managingEditor"`
 	// Webmaster is the email address for the person responsible for technical issues relating to the channel.
 	Webmaster string `xml:"rss webMaster"`
 	// PubDate is the publication date for the content in the channel.
 	// All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred).
 	PubDate string `xml:"rss pubDate"`
 	// LastBuildDate is the last time the content of the channel changed.
 	LastBuildDate string `xml:"rss lastBuildDate"`
 	// Categories is a collection of categories to which the channel belongs.
 	Categories []string `xml:"rss category"`
 	// Generator is a string indicating the program used to generate the channel.
 	Generator string `xml:"rss generator"`
 	// DocumentationURL is a URL that points to the documentation for the format used in the RSS file.
 	DocumentationURL string `xml:"rss docs"`
 	// Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1.
 	Cloud *RSSCloud `xml:"rss cloud"`
 	// Image specifies a GIF, JPEG or PNG image that can be displayed with the channel.
 	Image *RSSImage `xml:"rss image"`
 	// TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source.
 	TTL string `xml:"rss ttl"`
 	// SkipHours is a hint for aggregators telling them which hours they can skip.
 	// An XML element that contains up to 24 <hour> sub-elements whose value is a number between 0 and 23,
 	// representing a time in GMT, when aggregators,
 	// if they support the feature, may not read the channel on hours listed in the skipHours element.
 	SkipHours []string `xml:"rss skipHours>hour"`
 	// SkipDays is a hint for aggregators telling them which days they can skip.
 	// An XML element that contains up to seven <day> sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday.
 	SkipDays []string `xml:"rss skipDays>day"`
 	// Items is a collection of items.
 	Items []RSSItem `xml:"rss item"`
 	AtomLinks
 	itunes.ItunesChannelElement
 	googleplay.GooglePlayChannelElement
@ -64,16 +110,56 @@ type RSSImage struct {
 }
 type RSSItem struct {
-	Title       string         `xml:"rss title"`
+	// Title is the title of the item.
-	Link        string         `xml:"rss link"`
+	Title string `xml:"rss title"`
-	Description string         `xml:"rss description"`
+
-	Author      RSSAuthor      `xml:"rss author"`
+	// Link is the URL of the item.
-	Categories  []string       `xml:"rss category"`
+	Link string `xml:"rss link"`
-	CommentsURL string         `xml:"rss comments"`
+
-	Enclosures  []RSSEnclosure `xml:"rss enclosure"`
+	// Description is the item synopsis.
-	GUID        RSSGUID        `xml:"rss guid"`
+	Description string `xml:"rss description"`
-	PubDate     string         `xml:"rss pubDate"`
+
-	Source      RSSSource      `xml:"rss source"`
+	// Author is the email address of the author of the item.
 	Author RSSAuthor `xml:"rss author"`
 	// <category> is an optional sub-element of <item>.
 	// It has one optional attribute, domain, a string that identifies a categorization taxonomy.
 	Categories []string `xml:"rss category"`
 	// <comments> is an optional sub-element of <item>.
 	// If present, it contains the URL of the comments page for the item.
 	CommentsURL string `xml:"rss comments"`
 	// <enclosure> is an optional sub-element of <item>.
 	// It has three required attributes. url says where the enclosure is located,
 	// length says how big it is in bytes, and type says what its type is, a standard MIME type.
 	Enclosures []RSSEnclosure `xml:"rss enclosure"`
 	// <guid> is an optional sub-element of <item>.
 	// It's a string that uniquely identifies the item.
 	// When present, an aggregator may choose to use this string to determine if an item is new.
 	//
 	// There are no rules for the syntax of a guid.
 	// Aggregators must view them as a string.
 	// It's up to the source of the feed to establish the uniqueness of the string.
 	//
 	// If the guid element has an attribute named isPermaLink with a value of true,
 	// the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser,
 	// that points to the full item described by the <item> element.
 	//
 	// isPermaLink is optional, its default value is true.
 	// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
 	GUID RSSGUID `xml:"rss guid"`
 	// <pubDate> is the publication date of the item.
 	// Its value is a string in RFC 822 format.
 	PubDate string `xml:"rss pubDate"`
 	// <source> is an optional sub-element of <item>.
 	// Its value is the name of the RSS channel that the item came from, derived from its <title>.
 	// It has one required attribute, url, which contains the URL of the RSS channel.
 	Source RSSSource `xml:"rss source"`
 	dublincore.DublinCoreItemElement
 	FeedBurnerItemElement
 	media.MediaItemElement