From df5098b4c0a01a05c175320c65d50d3d27105f92 Mon Sep 17 00:00:00 2001
From: Jan-Lukas Else <mail@jlelse.de>
Date: Tue, 4 Jan 2022 18:15:09 +0100
Subject: [PATCH] Fix and improve htmlText util method for TTS and use it for
 summary as well

---
 postsFuncs.go |  7 ++-----
 tts.go        |  2 +-
 utils.go      | 48 ++++++++++++++++++++++++++++++++++++------------
 utils_test.go | 34 +++++++++++++++++++++++++++-------
 4 files changed, 66 insertions(+), 25 deletions(-)
diff --git a/postsFuncs.go b/postsFuncs.go
index 5d5fee8..93f528d 100644
--- a/postsFuncs.go
+++ b/postsFuncs.go
@@ -8,7 +8,6 @@ import (
 	"time"
 
 	gogeouri "git.jlel.se/jlelse/go-geouri"
-	"github.com/PuerkitoBio/goquery"
 	"github.com/araddon/dateparse"
 	"gopkg.in/yaml.v3"
 )
@@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) {
 	}
 	html := string(a.postHtml(p, false))
 	if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
-		doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0]))
-		summary = doc.Text()
+		summary = htmlText(splitted[0])
 	} else {
-		doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
-		summary = doc.Find("p").First().Text()
+		summary = strings.Split(htmlText(html), "\n\n")[0]
 	}
 	return
 }
diff --git a/tts.go b/tts.go
index c2e0dae..7a9915b 100644
--- a/tts.go
+++ b/tts.go
@@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error {
 	ssml.WriteString("<speak>")
 	ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
 	ssml.WriteString("<break time=\"1s\"/>")
-	ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false)))))
+	ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false)))))
 	ssml.WriteString("</speak>")
 
 	// Generate audio
diff --git a/utils.go b/utils.go
index 8c91ac4..57b2eb4 100644
--- a/utils.go
+++ b/utils.go
@@ -246,24 +246,48 @@ func mBytesString(size int64) string {
 }
 
 func htmlText(s string) string {
-	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s))
+	// Build policy to only allow a subset of HTML tags
+	textPolicy := bluemonday.StrictPolicy()
+	textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers
+	textPolicy.AllowElements("p")                                // Paragraphs
+	textPolicy.AllowElements("ol", "ul", "li")                   // Lists
+	textPolicy.AllowElements("blockquote")                       // Blockquotes
+	// Filter HTML tags
+	htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s))
+	// Read HTML into document
+	doc, _ := goquery.NewDocumentFromReader(htmlBuf)
 	var text strings.Builder
-	paragraphs := doc.Find("p")
-	if paragraphs.Length() == 0 {
-		text.WriteString(doc.Text())
+	if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 {
+		// Input was real HTML, so build the text from the body
+		// Declare recursive function to print children
+		var printChilds func(childs *goquery.Selection)
+		printChilds = func(childs *goquery.Selection) {
+			childs.Each(func(i int, sel *goquery.Selection) {
+				if i > 0 && // Not first child
+					sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // All elements that start a new paragraph
+					text.WriteString("\n\n")
+				}
+				if sel.Is("ol > li") { // List item in ordered list
+					fmt.Fprintf(&text, "%d. ", i+1) // Add list item number
+				}
+				if sel.Children().Length() > 0 { // Has children
+					printChilds(sel.Children()) // Recursive call to print childs
+				} else {
+					text.WriteString(sel.Text()) // Print text
+				}
+			})
+		}
+		printChilds(bodyChild)
 	} else {
-		paragraphs.Each(func(i int, s *goquery.Selection) {
-			if i > 0 {
-				text.WriteString("\n\n")
-			}
-			text.WriteString(s.Text())
-		})
+		// Input was probably just text, so just use the text
+		text.WriteString(doc.Text())
 	}
-	r := strings.TrimSpace(text.String())
-	return r
+	// Trim whitespace and return
+	return strings.TrimSpace(text.String())
 }
 
 func cleanHTMLText(s string) string {
+	// Clean HTML with UGC policy and return text
 	return htmlText(bluemonday.UGCPolicy().Sanitize(s))
 }
 
diff --git a/utils_test.go b/utils_test.go
index 87aeda7..d458966 100644
--- a/utils_test.go
+++ b/utils_test.go
@@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) {
 	})
 }
 
-func Test_cleanHTMLText(t *testing.T) {
-	assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`))
-	assert.Equal(t, `Test`, cleanHTMLText(`<b>Test</b>`))
-	assert.Equal(t, "Test\n\nTest", cleanHTMLText(`<p>Test</p><p>Test</p>`))
-	assert.Equal(t, "Test\n\nTest", cleanHTMLText("<p>Test</p>\n<p>Test</p>"))
-	assert.Equal(t, "Test\n\nTest", cleanHTMLText("<div><p>Test</p>\n<p>Test</p></div>"))
-	assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`<p>Test <b>test</b></p><p>Test</p>`))
+func Test_htmlText(t *testing.T) {
+	// Text without HTML
+	assert.Equal(t, "This is a test", htmlText("This is a test"))
+	// Text without HTML, but with emojis
+	assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁"))
+	// Text without HTML, but with quotes
+	assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'"))
+	// Text with formatting (like bold or italic)
+	assert.Equal(t, "This is a test", htmlText("<b>This is a test</b>"))
+	assert.Equal(t, "This is a test", htmlText("<i>This is a test</i>"))
+	// Unordered list
+	assert.Equal(t, "Test\n\nTest", htmlText(`<ul><li>Test</li><li>Test</li></ul>`))
+	// Ordered list
+	assert.Equal(t, "1. Test\n\n2. Test", htmlText(`<ol><li>Test</li><li>Test</li></ol>`))
+	// Nested unordered list
+	assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<ul><li>Test</li><li><ul><li>Test</li><li>Test</li></ul></li></ul>`))
+	// Headers and paragraphs
+	assert.Equal(t, "Test\n\nTest", htmlText(`<h1>Test</h1><p>Test</p>`))
+	assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<h1>Test</h1><p>Test</p><h2>Test</h2>`))
+	// Blockquote
+	assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`<p>Test</p><blockquote><p>Blockqoute content</p></blockquote>`))
+	// Nested blockquotes
+	assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`<blockquote><p>Blockqoute content</p><blockquote><p>Blockqoute content</p></blockquote></blockquote>`))
+	// Code (should be ignored)
+	assert.Equal(t, "Test", htmlText(`<p>Test</p><pre><code>Code content</code></pre>`))
+	// Inline code (should not be ignored)
+	assert.Equal(t, "Test Code content", htmlText(`<p>Test <code>Code content</code></p>`))
 }
 
 func Test_containsStrings(t *testing.T) {