diff --git a/postsFuncs.go b/postsFuncs.go
index 5d5fee8..93f528d 100644
--- a/postsFuncs.go
+++ b/postsFuncs.go
@@ -8,7 +8,6 @@ import (
"time"
gogeouri "git.jlel.se/jlelse/go-geouri"
- "github.com/PuerkitoBio/goquery"
"github.com/araddon/dateparse"
"gopkg.in/yaml.v3"
)
@@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) {
}
html := string(a.postHtml(p, false))
if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0]))
- summary = doc.Text()
+ summary = htmlText(splitted[0])
} else {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
- summary = doc.Find("p").First().Text()
+ summary = strings.Split(htmlText(html), "\n\n")[0]
}
return
}
diff --git a/tts.go b/tts.go
index c2e0dae..7a9915b 100644
--- a/tts.go
+++ b/tts.go
@@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error {
ssml.WriteString("")
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
ssml.WriteString("")
- ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false)))))
+ ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false)))))
ssml.WriteString("")
// Generate audio
diff --git a/utils.go b/utils.go
index 8c91ac4..57b2eb4 100644
--- a/utils.go
+++ b/utils.go
@@ -246,24 +246,48 @@ func mBytesString(size int64) string {
}
func htmlText(s string) string {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s))
+ // Reduce s to plain text: start from a strict policy and allow only the block-level tags listed below
+ textPolicy := bluemonday.StrictPolicy()
+ textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers
+ textPolicy.AllowElements("p") // Paragraphs
+ textPolicy.AllowElements("ol", "ul", "li") // Lists
+ textPolicy.AllowElements("blockquote") // Blockquotes
+ // Sanitize the input with that policy
+ htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s))
+ // Parse the sanitized HTML into a document (NOTE(review): the parse error is discarded)
+ doc, _ := goquery.NewDocumentFromReader(htmlBuf)
var text strings.Builder
- paragraphs := doc.Find("p")
- if paragraphs.Length() == 0 {
- text.WriteString(doc.Text())
+ if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 {
+ // The body has child elements, so walk them to build the text
+ // Declare the recursive function first so that it can call itself
+ var printChilds func(childs *goquery.Selection)
+ printChilds = func(childs *goquery.Selection) {
+ childs.Each(func(i int, sel *goquery.Selection) {
+ if i > 0 && // Not the first element at this level
+ sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // Elements that start a new paragraph
+ text.WriteString("\n\n")
+ }
+ if sel.Is("ol > li") { // List item in ordered list
+ fmt.Fprintf(&text, "%d. ", i+1) // Number is the item's 1-based position at this level
+ }
+ if sel.Children().Length() > 0 { // Has child elements
+ printChilds(sel.Children()) // Recurse into the children; NOTE(review): text placed directly in this element next to child elements appears to be skipped here - confirm
+ } else {
+ text.WriteString(sel.Text()) // Leaf element: write its text
+ }
+ })
+ }
+ printChilds(bodyChild)
} else {
- paragraphs.Each(func(i int, s *goquery.Selection) {
- if i > 0 {
- text.WriteString("\n\n")
- }
- text.WriteString(s.Text())
- })
+ // The body has no child elements (input was probably plain text), so use the document text as is
+ text.WriteString(doc.Text())
}
- r := strings.TrimSpace(text.String())
- return r
+ // Trim leading and trailing whitespace and return
+ return strings.TrimSpace(text.String())
}
func cleanHTMLText(s string) string {
+ // Sanitize s with bluemonday's UGC policy first, then reduce the result to plain text via htmlText
return htmlText(bluemonday.UGCPolicy().Sanitize(s))
}
diff --git a/utils_test.go b/utils_test.go
index 87aeda7..d458966 100644
--- a/utils_test.go
+++ b/utils_test.go
@@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) {
})
}
-func Test_cleanHTMLText(t *testing.T) {
- assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`))
- assert.Equal(t, `Test`, cleanHTMLText(`Test`))
- assert.Equal(t, "Test\n\nTest", cleanHTMLText(`
Test
Test
`))
- assert.Equal(t, "Test\n\nTest", cleanHTMLText("Test
\nTest
"))
- assert.Equal(t, "Test\n\nTest", cleanHTMLText(""))
- assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`Test test
Test
`))
+func Test_htmlText(t *testing.T) {
+ // Plain text without HTML
+ assert.Equal(t, "This is a test", htmlText("This is a test"))
+ // Plain text without HTML, containing an emoji
+ assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁"))
+ // Plain text without HTML, containing quotes
+ assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'"))
+ // Text with formatting (like bold or italic)
+ assert.Equal(t, "This is a test", htmlText("This is a test"))
+ assert.Equal(t, "This is a test", htmlText("This is a test"))
+ // Unordered list
+ assert.Equal(t, "Test\n\nTest", htmlText(``))
+ // Ordered list
+ assert.Equal(t, "1. Test\n\n2. Test", htmlText(`- Test
- Test
`))
+ // Nested unordered list
+ assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(``))
+ // Headers and paragraphs
+ assert.Equal(t, "Test\n\nTest", htmlText(`Test
Test
`))
+ assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`Test
Test
Test
`))
+ // Blockquote
+ assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`Test
Blockqoute content
`))
+ // Nested blockquotes
+ assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`Blockqoute content
Blockqoute content
`))
+ // Code (should be ignored)
+ assert.Equal(t, "Test", htmlText(`Test
Code content
`))
+ // Inline code (should not be ignored)
+ assert.Equal(t, "Test Code content", htmlText(`Test Code content
`))
}
func Test_containsStrings(t *testing.T) {