Return outer HTML when scraping elements

This commit is contained in:
cinput 2019-12-21 21:18:31 -08:00 committed by Frédéric Guillot
parent 30f22fbd78
commit 8e1ed8bef3
8 changed files with 73 additions and 8 deletions

View File

@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) {
document.Find(rules).Each(func(i int, s *goquery.Selection) {
var content string
// For some inline elements, we get the parent.
if s.Is("img") || s.Is("iframe") {
content, _ = s.Parent().Html()
} else {
content, _ = s.Html()
}
content, _ = goquery.OuterHtml(s)
contents += content
})

View File

@ -4,7 +4,12 @@
package scraper // import "miniflux.app/reader/scraper"
import "testing"
import (
"bytes"
"io/ioutil"
"strings"
"testing"
)
func TestGetPredefinedRules(t *testing.T) {
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) {
}
}
}
// TestSelectorRules checks the output of scrapContent for a set of CSS
// selector rules.
//
// For every case, the fixture testdata/<name> is scraped with the associated
// selector and the output is compared with the content of
// testdata/<name>-result, after trimming the surrounding whitespace of the
// expected file (for instance a trailing newline).
func TestSelectorRules(t *testing.T) {
	ruleTestCases := map[string]string{
		"img.html":    "article > img",
		"iframe.html": "article > iframe",
		"p.html":      "article > p",
	}

	for filename, rule := range ruleTestCases {
		// Copy the loop variables so the closure below never observes a
		// value from another iteration, whatever the Go version.
		filename, rule := filename, rule

		// One subtest per fixture: a fatal error in one case does not hide
		// the outcome of the others, and map iteration order is irrelevant.
		t.Run(filename, func(t *testing.T) {
			html, err := ioutil.ReadFile("testdata/" + filename)
			if err != nil {
				t.Fatalf(`Unable to read file %q: %v`, filename, err)
			}

			actualResult, err := scrapContent(bytes.NewReader(html), rule)
			if err != nil {
				t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
			}

			resultFilename := filename + "-result"
			rawExpected, err := ioutil.ReadFile("testdata/" + resultFilename)
			if err != nil {
				// Report the file that actually failed to load.
				t.Fatalf(`Unable to read file %q: %v`, resultFilename, err)
			}

			// Compare and report the same (trimmed) value so the failure
			// message reflects what was really compared.
			expectedResult := strings.TrimSpace(string(rawExpected))
			if actualResult != expectedResult {
				t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
			}
		})
	}
}

12
reader/scraper/testdata/iframe.html vendored Normal file
View File

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en-US">
<body>
<article>
<iframe id="1" src="about:blank"></iframe>
<iframe id="2" src="about:blank"></iframe>
<iframe id="3" src="about:blank"></iframe>
<iframe id="4" src="about:blank"></iframe>
<iframe id="5" src="about:blank"></iframe>
</article>
</body>
</html>

View File

@ -0,0 +1 @@
<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>

12
reader/scraper/testdata/img.html vendored Normal file
View File

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en-US">
<body>
<article>
<img id="1" src="#" alt="" />
<img id="2" src="#" alt="" />
<img id="3" src="#" alt="" />
<img id="4" src="#" alt="" />
<img id="5" src="#" alt="" />
</article>
</body>
</html>

View File

@ -0,0 +1 @@
<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>

10
reader/scraper/testdata/p.html vendored Normal file
View File

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="en-US">
<body>
<article>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
<p>Apquam tincidunt mauris eu risus.</p>
<p>Vestibulum auctor dapibus neque.</p>
</article>
</body>
</html>

1
reader/scraper/testdata/p.html-result vendored Normal file
View File

@ -0,0 +1 @@
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>