Remove some duplicated code in RSS parser

This commit is contained in:
Frédéric Guillot 2024-03-15 18:04:24 -07:00
parent dd4fb660c1
commit 4834e934f2
4 changed files with 227 additions and 64 deletions

View File

@ -39,7 +39,7 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
// Try to find the feed URL from the Atom links. // Try to find the feed URL from the Atom links.
for _, atomLink := range r.rss.Channel.AtomLinks.Links { for _, atomLink := range r.rss.Channel.AtomLinks.Links {
atomLinkHref := strings.TrimSpace(atomLink.URL) atomLinkHref := strings.TrimSpace(atomLink.Href)
if atomLinkHref != "" && atomLink.Rel == "self" { if atomLinkHref != "" && atomLink.Rel == "self" {
if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil { if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
feed.FeedURL = absoluteFeedURL feed.FeedURL = absoluteFeedURL
@ -170,8 +170,8 @@ func findEntryURL(rssItem *RSSItem) string {
} }
for _, atomLink := range rssItem.AtomLinks.Links { for _, atomLink := range rssItem.AtomLinks.Links {
if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") { if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
return strings.TrimSpace(atomLink.URL) return strings.TrimSpace(atomLink.Href)
} }
} }
@ -233,8 +233,8 @@ func findEntryAuthor(rssItem *RSSItem) string {
author = rssItem.ItunesAuthor author = rssItem.ItunesAuthor
case rssItem.DublinCoreCreator != "": case rssItem.DublinCoreCreator != "":
author = rssItem.DublinCoreCreator author = rssItem.DublinCoreCreator
case rssItem.AtomAuthor.String() != "": case rssItem.AtomAuthor.PersonName() != "":
author = rssItem.AtomAuthor.String() author = rssItem.AtomAuthor.PersonName()
case strings.Contains(rssItem.Author.Inner, "<![CDATA["): case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
author = rssItem.Author.Data author = rssItem.Author.Data
default: default:

View File

@ -3,41 +3,18 @@
package rss // import "miniflux.app/v2/internal/reader/rss" package rss // import "miniflux.app/v2/internal/reader/rss"
import "strings" import (
"miniflux.app/v2/internal/reader/atom"
)
type AtomAuthor struct { type AtomAuthor struct {
Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"` Author atom.AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
} }
func (a *AtomAuthor) String() string { func (a *AtomAuthor) PersonName() string {
return a.Author.String() return a.Author.PersonName()
}
type AtomPerson struct {
Name string `xml:"name"`
Email string `xml:"email"`
}
func (a *AtomPerson) String() string {
var name string
switch {
case a.Name != "":
name = a.Name
case a.Email != "":
name = a.Email
}
return strings.TrimSpace(name)
}
type AtomLink struct {
URL string `xml:"href,attr"`
Type string `xml:"type,attr"`
Rel string `xml:"rel,attr"`
Length string `xml:"length,attr"`
} }
type AtomLinks struct { type AtomLinks struct {
Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"` Links []*atom.AtomLink `xml:"http://www.w3.org/2005/Atom link"`
} }

View File

@ -746,6 +746,106 @@ func TestParseEntryWithContentEncoded(t *testing.T) {
} }
} }
// TestParseEntryDescriptionWithEncodedHTMLTags feeds the parser an item whose
// <description> carries HTML tags written as XML entities (&lt;b&gt;) and
// expects the entry content to contain the decoded tags.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodedHTMLTags(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description>this is &lt;b&gt;bold&lt;/b&gt;</description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so that a feed without entries fails this test
	// with a readable message instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Expected at least one entry, got none")
	}

	if feed.Entries[0].Content != `this is <b>bold</b>` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryWithDescriptionWithHTMLCDATA feeds the parser an item whose
// <description> wraps raw HTML in a CDATA section and expects the entry
// content to be the HTML inside that section.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryWithDescriptionWithHTMLCDATA(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description><![CDATA[this is <b>bold</b>]]></description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so that a feed without entries fails this test
	// with a readable message instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Expected at least one entry, got none")
	}

	if feed.Entries[0].Content != `this is <b>bold</b>` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryDescriptionWithEncodingAngleBracketsInText feeds the parser an
// item whose <description> contains doubly-escaped angle brackets (&amp;lt;)
// and expects the entry content to keep them as single-escaped entities
// (&lt;), i.e. only one level of XML decoding is applied.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodingAngleBracketsInText(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description>5 &amp;lt; 8, ticker symbol &amp;lt;BIGCO&amp;gt;</description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so that a feed without entries fails this test
	// with a readable message instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Expected at least one entry, got none")
	}

	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
// TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection feeds
// the parser an item whose <description> holds entity-escaped angle brackets
// (&lt;) inside a CDATA section and expects the entry content to contain that
// text unchanged.
//
// Example taken from https://www.rssboard.org/rss-encoding-examples
func TestParseEntryDescriptionWithEncodingAngleBracketsWithinCDATASection(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Example</title>
<link>http://example.org/</link>
<item>
<title>Item 1</title>
<link>http://example.org/item1</link>
<description><![CDATA[5 &lt; 8, ticker symbol &lt;BIGCO&gt;]]></description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below so that a feed without entries fails this test
	// with a readable message instead of an index-out-of-range panic.
	if len(feed.Entries) == 0 {
		t.Fatal("Expected at least one entry, got none")
	}

	if feed.Entries[0].Content != `5 &lt; 8, ticker symbol &lt;BIGCO&gt;` {
		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
	}
}
func TestParseEntryWithFeedBurnerLink(t *testing.T) { func TestParseEntryWithFeedBurnerLink(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?> data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> <rss version="2.0" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">

View File

@ -16,29 +16,75 @@ import (
// Specs: https://www.rssboard.org/rss-specification // Specs: https://www.rssboard.org/rss-specification
type RSS struct { type RSS struct {
Version string `xml:"rss version,attr"` // Version is the version of the RSS specification.
Version string `xml:"rss version,attr"`
// Channel is the main container for the RSS feed.
Channel RSSChannel `xml:"rss channel"` Channel RSSChannel `xml:"rss channel"`
} }
type RSSChannel struct { type RSSChannel struct {
Title string `xml:"rss title"` // Title is the name of the channel.
Link string `xml:"rss link"` Title string `xml:"rss title"`
Description string `xml:"rss description"`
Language string `xml:"rss language"` // Link is the URL to the HTML website corresponding to the channel.
Copyright string `xml:"rss copyRight"` Link string `xml:"rss link"`
ManagingEditor string `xml:"rss managingEditor"`
Webmaster string `xml:"rss webMaster"` // Description is a phrase or sentence describing the channel.
PubDate string `xml:"rss pubDate"` Description string `xml:"rss description"`
LastBuildDate string `xml:"rss lastBuildDate"`
Categories []string `xml:"rss category"` // Language is the language the channel is written in.
Generator string `xml:"rss generator"` // A list of allowable values for this element, as provided by Netscape, is here: https://www.rssboard.org/rss-language-codes.
Docs string `xml:"rss docs"` // You may also use values defined by the W3C: https://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes.
Cloud *RSSCloud `xml:"rss cloud"` Language string `xml:"rss language"`
Image *RSSImage `xml:"rss image"`
TTL string `xml:"rss ttl"` // Copyright is a string indicating the copyright.
SkipHours []string `xml:"rss skipHours>hour"` Copyright string `xml:"rss copyRight"`
SkipDays []string `xml:"rss skipDays>day"`
Items []RSSItem `xml:"rss item"` // ManagingEditor is the email address for the person responsible for editorial content.
ManagingEditor string `xml:"rss managingEditor"`
// Webmaster is the email address for the person responsible for technical issues relating to the channel.
Webmaster string `xml:"rss webMaster"`
// PubDate is the publication date for the content in the channel.
// All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred).
PubDate string `xml:"rss pubDate"`
// LastBuildDate is the last time the content of the channel changed.
LastBuildDate string `xml:"rss lastBuildDate"`
// Categories is a collection of categories to which the channel belongs.
Categories []string `xml:"rss category"`
// Generator is a string indicating the program used to generate the channel.
Generator string `xml:"rss generator"`
// DocumentationURL is a URL that points to the documentation for the format used in the RSS file.
DocumentationURL string `xml:"rss docs"`
// Cloud is a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1.
Cloud *RSSCloud `xml:"rss cloud"`
// Image specifies a GIF, JPEG or PNG image that can be displayed with the channel.
Image *RSSImage `xml:"rss image"`
// TTL is a number of minutes that indicates how long a channel can be cached before refreshing from the source.
TTL string `xml:"rss ttl"`
// SkipHours is a hint for aggregators telling them which hours they can skip.
// An XML element that contains up to 24 <hour> sub-elements whose value is a number between 0 and 23,
// representing a time in GMT, when aggregators,
// if they support the feature, may not read the channel on hours listed in the skipHours element.
SkipHours []string `xml:"rss skipHours>hour"`
// SkipDays is a hint for aggregators telling them which days they can skip.
// An XML element that contains up to seven <day> sub-elements whose value is Monday, Tuesday, Wednesday, Thursday, Friday, Saturday or Sunday.
SkipDays []string `xml:"rss skipDays>day"`
// Items is a collection of items.
Items []RSSItem `xml:"rss item"`
AtomLinks AtomLinks
itunes.ItunesChannelElement itunes.ItunesChannelElement
googleplay.GooglePlayChannelElement googleplay.GooglePlayChannelElement
@ -64,16 +110,56 @@ type RSSImage struct {
} }
type RSSItem struct { type RSSItem struct {
Title string `xml:"rss title"` // Title is the title of the item.
Link string `xml:"rss link"` Title string `xml:"rss title"`
Description string `xml:"rss description"`
Author RSSAuthor `xml:"rss author"` // Link is the URL of the item.
Categories []string `xml:"rss category"` Link string `xml:"rss link"`
CommentsURL string `xml:"rss comments"`
Enclosures []RSSEnclosure `xml:"rss enclosure"` // Description is the item synopsis.
GUID RSSGUID `xml:"rss guid"` Description string `xml:"rss description"`
PubDate string `xml:"rss pubDate"`
Source RSSSource `xml:"rss source"` // Author is the email address of the author of the item.
Author RSSAuthor `xml:"rss author"`
// <category> is an optional sub-element of <item>.
// It has one optional attribute, domain, a string that identifies a categorization taxonomy.
Categories []string `xml:"rss category"`
// <comments> is an optional sub-element of <item>.
// If present, it contains the URL of the comments page for the item.
CommentsURL string `xml:"rss comments"`
// <enclosure> is an optional sub-element of <item>.
// It has three required attributes. url says where the enclosure is located,
// length says how big it is in bytes, and type says what its type is, a standard MIME type.
Enclosures []RSSEnclosure `xml:"rss enclosure"`
// <guid> is an optional sub-element of <item>.
// It's a string that uniquely identifies the item.
// When present, an aggregator may choose to use this string to determine if an item is new.
//
// There are no rules for the syntax of a guid.
// Aggregators must view them as a string.
// It's up to the source of the feed to establish the uniqueness of the string.
//
// If the guid element has an attribute named isPermaLink with a value of true,
// the reader may assume that it is a permalink to the item, that is, a url that can be opened in a Web browser,
// that points to the full item described by the <item> element.
//
// isPermaLink is optional, its default value is true.
// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
GUID RSSGUID `xml:"rss guid"`
// <pubDate> is the publication date of the item.
// Its value is a string in RFC 822 format.
PubDate string `xml:"rss pubDate"`
// <source> is an optional sub-element of <item>.
// Its value is the name of the RSS channel that the item came from, derived from its <title>.
// It has one required attribute, url, which contains the URL of the RSS channel.
Source RSSSource `xml:"rss source"`
dublincore.DublinCoreItemElement dublincore.DublinCoreItemElement
FeedBurnerItemElement FeedBurnerItemElement
media.MediaItemElement media.MediaItemElement