mirror of https://github.com/jlelse/GoBlog
Fix and improve htmlText util method for TTS and use it for summary as well
This commit is contained in:
parent
d953b331c4
commit
df5098b4c0
|
@ -8,7 +8,6 @@ import (
|
|||
"time"
|
||||
|
||||
gogeouri "git.jlel.se/jlelse/go-geouri"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/araddon/dateparse"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) {
|
|||
}
|
||||
html := string(a.postHtml(p, false))
|
||||
if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
|
||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0]))
|
||||
summary = doc.Text()
|
||||
summary = htmlText(splitted[0])
|
||||
} else {
|
||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
summary = doc.Find("p").First().Text()
|
||||
summary = strings.Split(htmlText(html), "\n\n")[0]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
2
tts.go
2
tts.go
|
@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error {
|
|||
ssml.WriteString("<speak>")
|
||||
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
|
||||
ssml.WriteString("<break time=\"1s\"/>")
|
||||
ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false)))))
|
||||
ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false)))))
|
||||
ssml.WriteString("</speak>")
|
||||
|
||||
// Generate audio
|
||||
|
|
44
utils.go
44
utils.go
|
@ -246,24 +246,48 @@ func mBytesString(size int64) string {
|
|||
}
|
||||
|
||||
func htmlText(s string) string {
|
||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s))
|
||||
// Build policy to only allow a subset of HTML tags
|
||||
textPolicy := bluemonday.StrictPolicy()
|
||||
textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers
|
||||
textPolicy.AllowElements("p") // Paragraphs
|
||||
textPolicy.AllowElements("ol", "ul", "li") // Lists
|
||||
textPolicy.AllowElements("blockquote") // Blockquotes
|
||||
// Filter HTML tags
|
||||
htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s))
|
||||
// Read HTML into document
|
||||
doc, _ := goquery.NewDocumentFromReader(htmlBuf)
|
||||
var text strings.Builder
|
||||
paragraphs := doc.Find("p")
|
||||
if paragraphs.Length() == 0 {
|
||||
text.WriteString(doc.Text())
|
||||
} else {
|
||||
paragraphs.Each(func(i int, s *goquery.Selection) {
|
||||
if i > 0 {
|
||||
if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 {
|
||||
// Input was real HTML, so build the text from the body
|
||||
// Declare recursive function to print children
|
||||
var printChilds func(childs *goquery.Selection)
|
||||
printChilds = func(childs *goquery.Selection) {
|
||||
childs.Each(func(i int, sel *goquery.Selection) {
|
||||
if i > 0 && // Not first child
|
||||
sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // All elements that start a new paragraph
|
||||
text.WriteString("\n\n")
|
||||
}
|
||||
text.WriteString(s.Text())
|
||||
if sel.Is("ol > li") { // List item in ordered list
|
||||
fmt.Fprintf(&text, "%d. ", i+1) // Add list item number
|
||||
}
|
||||
if sel.Children().Length() > 0 { // Has children
|
||||
printChilds(sel.Children()) // Recursive call to print childs
|
||||
} else {
|
||||
text.WriteString(sel.Text()) // Print text
|
||||
}
|
||||
})
|
||||
}
|
||||
r := strings.TrimSpace(text.String())
|
||||
return r
|
||||
printChilds(bodyChild)
|
||||
} else {
|
||||
// Input was probably just text, so just use the text
|
||||
text.WriteString(doc.Text())
|
||||
}
|
||||
// Trim whitespace and return
|
||||
return strings.TrimSpace(text.String())
|
||||
}
|
||||
|
||||
func cleanHTMLText(s string) string {
|
||||
// Clean HTML with UGC policy and return text
|
||||
return htmlText(bluemonday.UGCPolicy().Sanitize(s))
|
||||
}
|
||||
|
||||
|
|
|
@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
func Test_cleanHTMLText(t *testing.T) {
|
||||
assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`))
|
||||
assert.Equal(t, `Test`, cleanHTMLText(`<b>Test</b>`))
|
||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText(`<p>Test</p><p>Test</p>`))
|
||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<p>Test</p>\n<p>Test</p>"))
|
||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<div><p>Test</p>\n<p>Test</p></div>"))
|
||||
assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`<p>Test <b>test</b></p><p>Test</p>`))
|
||||
func Test_htmlText(t *testing.T) {
|
||||
// Text without HTML
|
||||
assert.Equal(t, "This is a test", htmlText("This is a test"))
|
||||
// Text without HTML and Emojis
|
||||
assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁"))
|
||||
// Text without HTML and quotes
|
||||
assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'"))
|
||||
// Text with formatting (like bold or italic)
|
||||
assert.Equal(t, "This is a test", htmlText("<b>This is a test</b>"))
|
||||
assert.Equal(t, "This is a test", htmlText("<i>This is a test</i>"))
|
||||
// Unordered list
|
||||
assert.Equal(t, "Test\n\nTest", htmlText(`<ul><li>Test</li><li>Test</li></ul>`))
|
||||
// Ordered list
|
||||
assert.Equal(t, "1. Test\n\n2. Test", htmlText(`<ol><li>Test</li><li>Test</li></ol>`))
|
||||
// Nested unordered list
|
||||
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<ul><li>Test</li><li><ul><li>Test</li><li>Test</li></ul></li></ul>`))
|
||||
// Headers and paragraphs
|
||||
assert.Equal(t, "Test\n\nTest", htmlText(`<h1>Test</h1><p>Test</p>`))
|
||||
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<h1>Test</h1><p>Test</p><h2>Test</h2>`))
|
||||
// Blockquote
|
||||
assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`<p>Test</p><blockquote><p>Blockqoute content</p></blockquote>`))
|
||||
// Nested blockquotes
|
||||
assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`<blockquote><p>Blockqoute content</p><blockquote><p>Blockqoute content</p></blockquote></blockquote>`))
|
||||
// Code (should be ignored)
|
||||
assert.Equal(t, "Test", htmlText(`<p>Test</p><pre><code>Code content</code></pre>`))
|
||||
// Inline code (should not be ignored)
|
||||
assert.Equal(t, "Test Code content", htmlText(`<p>Test <code>Code content</code></p>`))
|
||||
}
|
||||
|
||||
func Test_containsStrings(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue