From df5098b4c0a01a05c175320c65d50d3d27105f92 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Else Date: Tue, 4 Jan 2022 18:15:09 +0100 Subject: [PATCH] Fix and improve htmlText util method for TTS and use it for summary as well --- postsFuncs.go | 7 ++----- tts.go | 2 +- utils.go | 48 ++++++++++++++++++++++++++++++++++++------------ utils_test.go | 34 +++++++++++++++++++++++++++------- 4 files changed, 66 insertions(+), 25 deletions(-) diff --git a/postsFuncs.go b/postsFuncs.go index 5d5fee8..93f528d 100644 --- a/postsFuncs.go +++ b/postsFuncs.go @@ -8,7 +8,6 @@ import ( "time" gogeouri "git.jlel.se/jlelse/go-geouri" - "github.com/PuerkitoBio/goquery" "github.com/araddon/dateparse" "gopkg.in/yaml.v3" ) @@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) { } html := string(a.postHtml(p, false)) if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 { - doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0])) - summary = doc.Text() + summary = htmlText(splitted[0]) } else { - doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html)) - summary = doc.Find("p").First().Text() + summary = strings.Split(htmlText(html), "\n\n")[0] } return } diff --git a/tts.go b/tts.go index c2e0dae..7a9915b 100644 --- a/tts.go +++ b/tts.go @@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error { ssml.WriteString("") ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title()))) ssml.WriteString("") - ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false))))) + ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false))))) ssml.WriteString("") // Generate audio diff --git a/utils.go b/utils.go index 8c91ac4..57b2eb4 100644 --- a/utils.go +++ b/utils.go @@ -246,24 +246,48 @@ func mBytesString(size int64) string { } func htmlText(s string) string { - doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s)) + // Build policy to only allow a subset of HTML tags + textPolicy := bluemonday.StrictPolicy() + textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers + textPolicy.AllowElements("p") // Paragraphs + textPolicy.AllowElements("ol", "ul", "li") // Lists + textPolicy.AllowElements("blockquote") // Blockquotes + // Filter HTML tags + htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s)) + // Read HTML into document + doc, _ := goquery.NewDocumentFromReader(htmlBuf) var text strings.Builder - paragraphs := doc.Find("p") - if paragraphs.Length() == 0 { - text.WriteString(doc.Text()) + if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 { + // Input was real HTML, so build the text from the body + // Declare recursive function to print childs + var printChilds func(childs *goquery.Selection) + printChilds = func(childs *goquery.Selection) { + childs.Each(func(i int, sel *goquery.Selection) { + if i > 0 && // Not first child + sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // All elements that start a new paragraph + text.WriteString("\n\n") + } + if sel.Is("ol > li") { // List item in ordered list + fmt.Fprintf(&text, "%d. ", i+1) // Add list item number + } + if sel.Children().Length() > 0 { // Has children + printChilds(sel.Children()) // Recursive call to print childs + } else { + text.WriteString(sel.Text()) // Print text + } + }) + } + printChilds(bodyChild) } else { - paragraphs.Each(func(i int, s *goquery.Selection) { - if i > 0 { - text.WriteString("\n\n") - } - text.WriteString(s.Text()) - }) + // Input was probably just text, so just use the text + text.WriteString(doc.Text()) } - r := strings.TrimSpace(text.String()) - return r + // Trim whitespace and return + return strings.TrimSpace(text.String()) } func cleanHTMLText(s string) string { + // Clean HTML with UGC policy and return text return htmlText(bluemonday.UGCPolicy().Sanitize(s)) } diff --git a/utils_test.go b/utils_test.go index 87aeda7..d458966 100644 --- a/utils_test.go +++ b/utils_test.go @@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) { }) } -func Test_cleanHTMLText(t *testing.T) { - assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`)) - assert.Equal(t, `Test`, cleanHTMLText(`Test`)) - assert.Equal(t, "Test\n\nTest", cleanHTMLText(`

Test

Test

`)) - assert.Equal(t, "Test\n\nTest", cleanHTMLText("

Test

\n

Test

")) - assert.Equal(t, "Test\n\nTest", cleanHTMLText("

Test

\n

Test

")) - assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`

Test test

Test

`)) +func Test_htmlText(t *testing.T) { + // Text without HTML + assert.Equal(t, "This is a test", htmlText("This is a test")) + // Text without HTML and Emojis + assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁")) + // Text without HTML and quoutes + assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'")) + // Text with formatting (like bold or italic) + assert.Equal(t, "This is a test", htmlText("This is a test")) + assert.Equal(t, "This is a test", htmlText("This is a test")) + // Unordered list + assert.Equal(t, "Test\n\nTest", htmlText(``)) + // Ordered list + assert.Equal(t, "1. Test\n\n2. Test", htmlText(`
  1. Test
  2. Test
`)) + // Nested unordered list + assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(``)) + // Headers and paragraphs + assert.Equal(t, "Test\n\nTest", htmlText(`

Test

Test

`)) + assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`

Test

Test

Test

`)) + // Blockquote + assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`

Test

Blockqoute content

`)) + // Nested blockquotes + assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`

Blockqoute content

Blockqoute content

`)) + // Code (should be ignored) + assert.Equal(t, "Test", htmlText(`

Test

Code content
`)) + // Inline code (should not be ignored) + assert.Equal(t, "Test Code content", htmlText(`

Test Code content

`)) } func Test_containsStrings(t *testing.T) {