diff --git a/go.mod b/go.mod index ec90e70e..e6256ecb 100644 --- a/go.mod +++ b/go.mod @@ -4,12 +4,12 @@ module miniflux.app/v2 require ( github.com/PuerkitoBio/goquery v1.8.1 + github.com/abadojack/whatlanggo v1.0.1 github.com/coreos/go-oidc/v3 v3.6.0 github.com/gorilla/mux v1.8.0 github.com/lib/pq v1.10.9 github.com/mccutchen/go-httpbin/v2 v2.11.1 github.com/prometheus/client_golang v1.17.0 - github.com/rylans/getlang v0.0.0-20201227074721-9e7f44ff8aa0 github.com/tdewolff/minify/v2 v2.12.9 github.com/yuin/goldmark v1.5.6 golang.org/x/crypto v0.14.0 @@ -29,6 +29,7 @@ require ( github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect + github.com/stretchr/testify v1.8.4 // indirect github.com/tdewolff/parse/v2 v2.6.8 // indirect golang.org/x/sys v0.13.0 // indirect golang.org/x/text v0.13.0 // indirect diff --git a/go.sum b/go.sum index 21d0762f..8df4025a 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4= +github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -40,12 +42,10 @@ github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdO github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= github.com/prometheus/procfs v0.11.1/go.mod 
h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= -github.com/rylans/getlang v0.0.0-20201227074721-9e7f44ff8aa0 h1:qSaU9YAEIxk/ozcmY1hiauktAYTpbwYIrPdQ0L2E8UM= -github.com/rylans/getlang v0.0.0-20201227074721-9e7f44ff8aa0/go.mod h1:3vfmZI6aJd5Rb9W2TQ0Nmupl+qem21R05+hmCscI0Bk= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tdewolff/minify/v2 v2.12.9 h1:dvn5MtmuQ/DFMwqf5j8QhEVpPX6fi3WGImhv8RUB4zA= github.com/tdewolff/minify/v2 v2.12.9/go.mod h1:qOqdlDfL+7v0/fyymB+OP497nIxJYSvX4MQWA8OoiXU= github.com/tdewolff/parse/v2 v2.6.8 h1:mhNZXYCx//xG7Yq2e/kVLNZw4YfYmeHbhx+Zc0OvFMA= @@ -110,7 +110,8 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8= mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE= diff --git 
a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index c6514bf6..2fa85a7c 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -7,25 +7,22 @@ import ( "errors" "fmt" "log/slog" - "math" "regexp" "strconv" - "strings" "time" - "unicode/utf8" "miniflux.app/v2/internal/config" "miniflux.app/v2/internal/http/client" "miniflux.app/v2/internal/metric" "miniflux.app/v2/internal/model" "miniflux.app/v2/internal/reader/browser" + "miniflux.app/v2/internal/reader/readingtime" "miniflux.app/v2/internal/reader/rewrite" "miniflux.app/v2/internal/reader/sanitizer" "miniflux.app/v2/internal/reader/scraper" "miniflux.app/v2/internal/storage" "github.com/PuerkitoBio/goquery" - "github.com/rylans/getlang" ) var ( @@ -174,7 +171,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) if content != "" { entry.Content = content - entry.ReadingTime = calculateReadingTime(content, user) + entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed) } rewrite.Rewriter(url, entry, entry.Feed.RewriteRules) @@ -252,7 +249,7 @@ func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *mod } // Handle YT error case and non-YT entries. 
if entry.ReadingTime == 0 { - entry.ReadingTime = calculateReadingTime(entry.Content, user) + entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed) } } @@ -360,18 +357,3 @@ func parseISO8601(from string) (time.Duration, error) { return d, nil } - -func calculateReadingTime(content string, user *model.User) int { - sanitizedContent := sanitizer.StripTags(content) - languageInfo := getlang.FromString(sanitizedContent) - - var timeToReadInt int - if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "jp" { - timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(user.CJKReadingSpeed))) - } else { - nbOfWords := len(strings.Fields(sanitizedContent)) - timeToReadInt = int(math.Ceil(float64(nbOfWords) / float64(user.DefaultReadingSpeed))) - } - - return timeToReadInt -} diff --git a/internal/reader/readingtime/readingtime.go b/internal/reader/readingtime/readingtime.go new file mode 100644 index 00000000..faf78471 --- /dev/null +++ b/internal/reader/readingtime/readingtime.go @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Package readingtime provides a function to estimate the reading time of an article. +package readingtime + +import ( + "math" + "strings" + "unicode/utf8" + + "miniflux.app/v2/internal/reader/sanitizer" + + "github.com/abadojack/whatlanggo" +) + +// EstimateReadingTime returns the estimated reading time of an article in minutes.
+func EstimateReadingTime(content string, defaultReadingSpeed, cjkReadingSpeed int) int {
+	sanitizedContent := sanitizer.StripTags(content)
+	langInfo := whatlanggo.Detect(sanitizedContent)
+	units, speed := 0, defaultReadingSpeed
+	if langInfo.IsReliable() && (langInfo.Lang == whatlanggo.Jpn || langInfo.Lang == whatlanggo.Cmn || langInfo.Lang == whatlanggo.Kor) {
+		units, speed = utf8.RuneCountInString(sanitizedContent), cjkReadingSpeed // CJK content is measured in runes
+	} else {
+		units = len(strings.Fields(sanitizedContent)) // everything else in whitespace-separated words
+	}
+	if speed <= 0 { // no meaningful estimate: dividing by zero gives +Inf/NaN, whose int conversion is implementation-dependent
+		return 0
+	}
+	return int(math.Ceil(float64(units) / float64(speed)))
+}
diff --git a/internal/reader/readingtime/readingtime_test.go b/internal/reader/readingtime/readingtime_test.go
new file mode 100644
index 00000000..4915c7cb
--- /dev/null
+++ b/internal/reader/readingtime/readingtime_test.go
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package readingtime
+
+import "testing"
+
+func TestEstimateReadingTimeInEnglish(t *testing.T) {
+	sampleText := `
+	In turpis lacus, sollicitudin non accumsan sed, suscipit eget magna. Morbi id
+	neque enim. Aenean ac lacus consectetur, accumsan elit ac, suscipit dui. Donec
+	congue mi et nisl bibendum, venenatis fringilla orci tristique. Nullam ullamcorper
+	cursus justo, ac iaculis ante euismod a. Fusce dapibus lacus arcu, consectetur
+	porttitor odio finibus ac. Integer dictum faucibus egestas. Etiam magna diam, placerat
+	sed velit vitae, lobortis accumsan nisi. Sed viverra dui in odio commodo dapibus.
+	Sed pulvinar metus finibus, hendrerit diam eu, faucibus lectus. Mauris est tellus,
+	convallis et velit sit amet, convallis sagittis nunc. Quisque at ex leo. Donec eget leo
+	vel nibh porta molestie. Aenean pellentesque purus non laoreet aliquam.
+
+	In feugiat eget arcu nec sodales. Nunc rutrum felis in tellus venenatis, sit
+	amet tincidunt augue varius. Nunc nec dignissim quam. In euismod gravida rhoncus.
+	Vivamus eget nibh sed diam malesuada facilisis. Donec ac convallis elit. Fusce
+	fermentum tincidunt est. Nunc viverra, eros in gravida convallis, ex augue vehicula
+	magna, sed tincidunt metus sem et mauris. In pretium purus odio, a auctor tellus
+	ornare vel. Donec ac dolor pulvinar, placerat elit eget, ultrices nisi. Donec
+	tincidunt magna eget pretium sodales. In urna lorem, consectetur in fringilla eget,
+	rutrum et erat. Proin fringilla, lectus eget commodo consequat, est massa lacinia
+	lorem, ut ultricies nunc erat id sapien.
+
+	Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce fermentum id
+	sem sed commodo. Ut eget mauris eu lectus mollis aliquam. Fusce convallis, quam
+	vel volutpat aliquet, nunc sem rhoncus magna, a iaculis enim ex nec neque.
+	Suspendisse vel imperdiet leo. Quisque ultrices semper commodo. Pellentesque nec libero et
+	mauris gravida porta vitae id nunc. Fusce sed sem sed augue gravida ultricies at nec
+	turpis. Sed semper eu urna sit amet malesuada. Suspendisse blandit condimentum elit,
+	in scelerisque tellus convallis eu. Nunc eleifend sem et mauris vestibulum
+	mattis. Praesent ultricies pellentesque eros non posuere.
+	`
+
+	readingTime := EstimateReadingTime(sampleText, 200, 500)
+	if readingTime != 2 {
+		t.Errorf(`Wrong reading time, got %d instead of 2`, readingTime)
+	}
+}
+
+func TestEstimateReadingTimeInChinese(t *testing.T) {
+	sampleText := `
+	労問委格名町違載式新青脂通由。割止書円画民京般著治登門画拡下。有国同観教田美森素説砂者徴多。上治速相支存色分繰年活元事集遣逆山。身消年森発世財間世変悲原記潟旅好手真今。現通浪口特愛始信川節身方一表著購。郁不使権草定内防並要更一条露加。載交源図訴際属年券重供健三洗。事北残却女鮎朝分要廷込宣政愛無投事。
+
+	問警技亮参沼洗請米物模人。誰探重午局新戦報投性病庭。典向載問千著書故表視新権最石車音端乏大。白僚三掲局係仕表広無旧見要最裁。額寄済生年余講前本次載隊劇。権成観始応泉早高拓了経地本稼室目犯井出。暮載必広傷内校岡公南散広転行別釈。康運行関本掲隠泉傷退報告。独変年換差取予口男旅挑講禁姿。出芳工類胸管払時済潟髪内豊。
+
+	康浴部問玲玉追球化就店岡問画路投。施先太業阪能敏所陸不供探掲方用。手右演社援発示竹育対橋除際愛功旬転好使公。利時改本項輸属嘆員複携者地剤。天政朝戸祝言月接住世黙極者議編連。囲淑覧重弾必治物健賄開頂外称豊開名銀戸院。政稿調励廃演手生告題営味董演何南峰貨。学横公得行提大品回猿齢利込家前役把煎。天代者内身慢作業署間地日。
+
+	中個興本広坂態掲神中能等無滞長対。号処月画界意気様党目購栃欠歌暮。一耳供意盛四俊健必財下画例本判著堺要北王。宮大攻人水一備治首闘振円分建前趣校。目少供午見掲岡安画入情薦続土世始。診読格七久改急目斉実配正。性止月模多様更社発掲雪奇芸量全兵経負。予転済反問止下生買再無旅的。模治明以共会必華浅知館版領送。
+	`
+
+	readingTime := EstimateReadingTime(sampleText, 200, 500)
+	if readingTime != 2 {
+		t.Errorf(`Wrong reading time, got %d instead of 2`, readingTime)
+	}
+}
+
+func TestEstimateReadingTimeWithInvalidSpeed(t *testing.T) {
+	if readingTime := EstimateReadingTime("some words to read", 0, 0); readingTime != 0 {
+		t.Errorf(`Wrong reading time, got %d instead of 0`, readingTime)
+	}
+}