Calculate reading time during feed processing

The goal is to speed up the user interface.

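Detecting the language from the entry content is slow, so the reading time is now computed once while the feed is processed and stored with the entry, instead of being recalculated each time a page is rendered.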
This commit is contained in:
Frédéric Guillot 2020-11-18 17:29:40 -08:00 committed by fguillot
parent b1c9977711
commit de7a613098
12 changed files with 84 additions and 50 deletions

View File

@ -129,20 +129,21 @@ type Feeds []*Feed
// Entry represents a subscription item in the system.
//
// NOTE(review): this span is a diff hunk, so the field list appears twice:
// first as it was before the change, then as it is after the change. The two
// lists are identical except for the added ReadingTime field.
type Entry struct {
// Field list before the change.
ID int64 `json:"id"`
UserID int64 `json:"user_id"`
FeedID int64 `json:"feed_id"`
Status string `json:"status"`
Hash string `json:"hash"`
Title string `json:"title"`
URL string `json:"url"`
Date time.Time `json:"published_at"`
Content string `json:"content"`
Author string `json:"author"`
ShareCode string `json:"share_code"`
Starred bool `json:"starred"`
Enclosures Enclosures `json:"enclosures,omitempty"`
Feed *Feed `json:"feed,omitempty"`
// Field list after the change.
ID int64 `json:"id"`
UserID int64 `json:"user_id"`
FeedID int64 `json:"feed_id"`
Status string `json:"status"`
Hash string `json:"hash"`
Title string `json:"title"`
URL string `json:"url"`
Date time.Time `json:"published_at"`
Content string `json:"content"`
Author string `json:"author"`
ShareCode string `json:"share_code"`
Starred bool `json:"starred"`
// ReadingTime is the estimated reading time of the entry, serialized as
// "reading_time". Presumably expressed in minutes; confirm against the
// code that computes it.
ReadingTime int `json:"reading_time"`
Enclosures Enclosures `json:"enclosures,omitempty"`
Feed *Feed `json:"feed,omitempty"`
}
// Entries represents a list of entries.

View File

@ -12,7 +12,7 @@ import (
"miniflux.app/logger"
)
const schemaVersion = 40
const schemaVersion = 41
// Migrate executes database migrations.
func Migrate(db *sql.DB) {

View File

@ -203,6 +203,7 @@ alter table users add column entry_direction entry_sorting_direction default 'as
add column keeplist_rules text not null default ''
;
`,
"schema_version_41": `alter table entries add column reading_time int not null default 0;`,
"schema_version_5": `create table integrations (
user_id int not null,
pinboard_enabled bool default 'f',
@ -264,6 +265,7 @@ var SqlMapChecksums = map[string]string{
"schema_version_39": "b0f90b97502921d4681a07c64d180a91a0b4ccac7d3c1dbe30519ad6f1bf1737",
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
"schema_version_40": "6a8fec92399f853ed6817aff4cfa43255dce4c19afad796e41519d09de62105e",
"schema_version_41": "128e118ce61267ea1f6ae03b63a6d4734eae87e520b00e309ad083f1f6afdfe5",
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
"schema_version_7": "33f298c9aa30d6de3ca28e1270df51c2884d7596f1283a75716e2aeb634cd05c",

View File

@ -0,0 +1 @@
alter table entries add column reading_time int not null default 0;

View File

@ -33,6 +33,7 @@ type Entry struct {
Author string `json:"author"`
ShareCode string `json:"share_code"`
Starred bool `json:"starred"`
ReadingTime int `json:"reading_time"`
Enclosures EnclosureList `json:"enclosures,omitempty"`
Feed *Feed `json:"feed,omitempty"`
}

View File

@ -5,8 +5,11 @@
package processor
import (
"math"
"regexp"
"strings"
"time"
"unicode/utf8"
"miniflux.app/config"
"miniflux.app/logger"
@ -16,6 +19,8 @@ import (
"miniflux.app/reader/sanitizer"
"miniflux.app/reader/scraper"
"miniflux.app/storage"
"github.com/rylans/getlang"
)
// ProcessFeedEntries downloads original web page for entries and apply filters.
@ -58,6 +63,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
entry.ReadingTime = calculateReadingTime(entry.Content)
filteredEntries = append(filteredEntries, entry)
}
@ -108,7 +114,23 @@ func ProcessEntryWebPage(entry *model.Entry) error {
if content != "" {
entry.Content = content
entry.ReadingTime = calculateReadingTime(content)
}
return nil
}
// calculateReadingTime estimates the reading time of content, rounded up to
// a whole number (presumably minutes; confirm against the UI string).
//
// HTML tags are stripped first and the language is detected from the
// remaining text. Korean, Chinese and Japanese text is measured in characters
// (500 per unit); any other language is measured in whitespace-separated
// words (265 per unit).
func calculateReadingTime(content string) int {
	sanitizedContent := sanitizer.StripTags(content)

	switch getlang.FromString(sanitizedContent).LanguageCode() {
	// "ja" is the ISO 639-1 code for Japanese. The previously used "jp" is
	// kept as well so that behavior cannot regress if a detector ever
	// reports that value.
	case "ko", "zh", "ja", "jp":
		// These scripts do not separate words with spaces, so count runes.
		return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500))
	default:
		nbOfWords := len(strings.Fields(sanitizedContent))
		return int(math.Ceil(float64(nbOfWords) / 265))
	}
}

View File

@ -75,11 +75,11 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error {
UPDATE
entries
SET
content=$1
content=$1, reading_time=$2
WHERE
id=$2 AND user_id=$3
id=$3 AND user_id=$4
`
_, err = tx.Exec(query, entry.Content, entry.ID, entry.UserID)
_, err = tx.Exec(query, entry.Content, entry.ReadingTime, entry.ID, entry.UserID)
if err != nil {
tx.Rollback()
return fmt.Errorf(`store: unable to update content of entry #%d: %v`, entry.ID, err)
@ -106,9 +106,35 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error {
func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
query := `
INSERT INTO entries
(title, hash, url, comments_url, published_at, content, author, user_id, feed_id, changed_at, document_vectors)
(
title,
hash,
url,
comments_url,
published_at,
content,
author,
user_id,
feed_id,
reading_time,
changed_at,
document_vectors
)
VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, now(), setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B'))
(
$1,
$2,
$3,
$4,
$5,
$6,
$7,
$8,
$9,
$10,
now(),
setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B')
)
RETURNING
id, status
`
@ -123,6 +149,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
entry.Author,
entry.UserID,
entry.FeedID,
entry.ReadingTime,
).Scan(&entry.ID, &entry.Status)
if err != nil {
@ -154,9 +181,10 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
comments_url=$3,
content=$4,
author=$5,
reading_time=$6,
document_vectors = setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($4, '') for 1000000)), 'B')
WHERE
user_id=$6 AND feed_id=$7 AND hash=$8
user_id=$7 AND feed_id=$8 AND hash=$9
RETURNING
id
`
@ -167,6 +195,7 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
entry.CommentsURL,
entry.Content,
entry.Author,
entry.ReadingTime,
entry.UserID,
entry.FeedID,
entry.Hash,

View File

@ -226,6 +226,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
e.content,
e.status,
e.starred,
e.reading_time,
f.title as feed_title,
f.feed_url,
f.site_url,
@ -284,6 +285,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
&entry.Content,
&entry.Status,
&entry.Starred,
&entry.ReadingTime,
&entry.Feed.Title,
&entry.Feed.FeedURL,
&entry.Feed.SiteURL,

View File

@ -242,10 +242,10 @@ SOFTWARE.
<li>
<time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time>
</li>
{{ if .user.ShowReadingTime }}
{{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
<li>
<span>
{{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }}
{{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
</span>
</li>
{{ end }}
@ -523,7 +523,7 @@ var templateCommonMapChecksums = map[string]string{
"feed_list": "931e43d328a116318c510de5658c688cd940b934c86b6ec82a472e1f81e020ae",
"feed_menu": "318d8662dda5ca9dfc75b909c8461e79c86fb5082df1428f67aaf856f19f4b50",
"icons": "9a41753778072f286216085d8712495e2ccca20c7a24f5c982775436a3d38579",
"item_meta": "eb72c6e2a924759af20b8ef41f2ce7495aedc053181c2e5ca1b063f9410c58b0",
"item_meta": "56ab09d7dd46eeb2e2ee11ddcec0c157a5832c896dbd2887d9e2b013680b2af6",
"layout": "65767e7dbebe1f7ed42895ecd5a737b0693e4a2ec35e84e3e391f462beb11977",
"pagination": "7b61288e86283c4cf0dc83bcbf8bf1c00c7cb29e60201c8c0b633b2450d2911f",
"settings_menu": "e2b777630c0efdbc529800303c01d6744ed3af80ec505ac5a5b3f99c9b989156",

View File

@ -65,9 +65,6 @@ func (e *Engine) Render(name, language string, data interface{}) []byte {
"plural": func(key string, n int, args ...interface{}) string {
return printer.Plural(key, n, args...)
},
"timeToRead": func(content string) int {
return timeToRead(content)
},
})
var b bytes.Buffer

View File

@ -11,19 +11,16 @@ import (
"net/mail"
"strings"
"time"
"unicode/utf8"
"miniflux.app/config"
"miniflux.app/http/route"
"miniflux.app/locale"
"miniflux.app/model"
"miniflux.app/proxy"
"miniflux.app/reader/sanitizer"
"miniflux.app/timezone"
"miniflux.app/url"
"github.com/gorilla/mux"
"github.com/rylans/getlang"
)
type funcMap struct {
@ -94,9 +91,6 @@ func (f *funcMap) Map() template.FuncMap {
"plural": func(key string, n int, args ...interface{}) string {
return ""
},
"timeToRead": func(content string) int {
return 0
},
}
}
@ -195,18 +189,3 @@ func formatFileSize(b int64) string {
return fmt.Sprintf("%.1f %ciB",
float64(b)/float64(div), "KMGTPE"[exp])
}
// timeToRead estimates the reading time of content, rounded up to a whole
// number (presumably minutes; confirm against the UI string).
//
// Tags are stripped before the language is detected. Korean, Chinese and
// Japanese text is counted in characters (500 per unit); all other text is
// counted in whitespace-separated words (265 per unit).
func timeToRead(content string) int {
	sanitizedContent := sanitizer.StripTags(content)
	languageCode := getlang.FromString(sanitizedContent).LanguageCode()

	// Japanese is "ja" in ISO 639-1; the old "jp" comparison is retained
	// alongside it so no previously matching input changes branch.
	isCJK := languageCode == "ko" || languageCode == "zh" ||
		languageCode == "ja" || languageCode == "jp"

	if isCJK {
		// No spaces between words in these scripts, so count runes instead.
		return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500))
	}

	nbOfWords := len(strings.Fields(sanitizedContent))
	return int(math.Ceil(float64(nbOfWords) / 265))
}

View File

@ -7,10 +7,10 @@
<li>
<time datetime="{{ isodate .entry.Date }}" title="{{ isodate .entry.Date }}">{{ elapsed .user.Timezone .entry.Date }}</time>
</li>
{{ if .user.ShowReadingTime }}
{{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
<li>
<span>
{{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }}
{{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
</span>
</li>
{{ end }}