Fix and improve htmlText util method for TTS and use it for summary as well

This commit is contained in:
Jan-Lukas Else 2022-01-04 18:15:09 +01:00
parent d953b331c4
commit df5098b4c0
4 changed files with 66 additions and 25 deletions

View File

@ -8,7 +8,6 @@ import (
"time"
gogeouri "git.jlel.se/jlelse/go-geouri"
"github.com/PuerkitoBio/goquery"
"github.com/araddon/dateparse"
"gopkg.in/yaml.v3"
)
@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) {
}
html := string(a.postHtml(p, false))
if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0]))
summary = doc.Text()
summary = htmlText(splitted[0])
} else {
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
summary = doc.Find("p").First().Text()
summary = strings.Split(htmlText(html), "\n\n")[0]
}
return
}

2
tts.go
View File

@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error {
ssml.WriteString("<speak>")
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
ssml.WriteString("<break time=\"1s\"/>")
ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false)))))
ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false)))))
ssml.WriteString("</speak>")
// Generate audio

View File

@ -246,24 +246,48 @@ func mBytesString(size int64) string {
}
func htmlText(s string) string {
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s))
// Build policy to only allow a subset of HTML tags
textPolicy := bluemonday.StrictPolicy()
textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers
textPolicy.AllowElements("p") // Paragraphs
textPolicy.AllowElements("ol", "ul", "li") // Lists
textPolicy.AllowElements("blockquote") // Blockquotes
// Filter HTML tags
htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s))
// Read HTML into document
doc, _ := goquery.NewDocumentFromReader(htmlBuf)
var text strings.Builder
paragraphs := doc.Find("p")
if paragraphs.Length() == 0 {
text.WriteString(doc.Text())
if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 {
// Input was real HTML, so build the text from the body
// Declare a recursive function that prints the child elements
var printChilds func(childs *goquery.Selection)
printChilds = func(childs *goquery.Selection) {
childs.Each(func(i int, sel *goquery.Selection) {
if i > 0 && // Not first child
sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // All elements that start a new paragraph
text.WriteString("\n\n")
}
if sel.Is("ol > li") { // List item in ordered list
fmt.Fprintf(&text, "%d. ", i+1) // Add list item number
}
if sel.Children().Length() > 0 { // Has children
printChilds(sel.Children()) // Recursive call to print childs
} else {
text.WriteString(sel.Text()) // Print text
}
})
}
printChilds(bodyChild)
} else {
paragraphs.Each(func(i int, s *goquery.Selection) {
if i > 0 {
text.WriteString("\n\n")
}
text.WriteString(s.Text())
})
// Input was probably just text, so just use the text
text.WriteString(doc.Text())
}
r := strings.TrimSpace(text.String())
return r
// Trim whitespace and return
return strings.TrimSpace(text.String())
}
func cleanHTMLText(s string) string {
// Clean HTML with UGC policy and return text
return htmlText(bluemonday.UGCPolicy().Sanitize(s))
}

View File

@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) {
})
}
func Test_cleanHTMLText(t *testing.T) {
assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`))
assert.Equal(t, `Test`, cleanHTMLText(`<b>Test</b>`))
assert.Equal(t, "Test\n\nTest", cleanHTMLText(`<p>Test</p><p>Test</p>`))
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<p>Test</p>\n<p>Test</p>"))
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<div><p>Test</p>\n<p>Test</p></div>"))
assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`<p>Test <b>test</b></p><p>Test</p>`))
// Test_htmlText checks the plain text that htmlText returns for plain-text
// input and for HTML containing inline formatting, lists, headings,
// blockquotes and code.
func Test_htmlText(t *testing.T) {
// Text without HTML is returned unchanged
assert.Equal(t, "This is a test", htmlText("This is a test"))
// Text without HTML, but with emojis
assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁"))
// Text without HTML, but with quotes
assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'"))
// Text with inline formatting (like bold or italic): only the text remains
assert.Equal(t, "This is a test", htmlText("<b>This is a test</b>"))
assert.Equal(t, "This is a test", htmlText("<i>This is a test</i>"))
// Unordered list: items are separated by a blank line
assert.Equal(t, "Test\n\nTest", htmlText(`<ul><li>Test</li><li>Test</li></ul>`))
// Ordered list: each item is prefixed with its number
assert.Equal(t, "1. Test\n\n2. Test", htmlText(`<ol><li>Test</li><li>Test</li></ol>`))
// Nested unordered list: inner items are flattened into the same sequence
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<ul><li>Test</li><li><ul><li>Test</li><li>Test</li></ul></li></ul>`))
// Headers and paragraphs are each separated by a blank line
assert.Equal(t, "Test\n\nTest", htmlText(`<h1>Test</h1><p>Test</p>`))
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<h1>Test</h1><p>Test</p><h2>Test</h2>`))
// Blockquote
assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`<p>Test</p><blockquote><p>Blockqoute content</p></blockquote>`))
// Nested blockquotes
assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`<blockquote><p>Blockqoute content</p><blockquote><p>Blockqoute content</p></blockquote></blockquote>`))
// Code block inside <pre> (its content is expected to be dropped)
assert.Equal(t, "Test", htmlText(`<p>Test</p><pre><code>Code content</code></pre>`))
// Inline code inside a paragraph (its content is expected to be kept)
assert.Equal(t, "Test Code content", htmlText(`<p>Test <code>Code content</code></p>`))
}
func Test_containsStrings(t *testing.T) {