mirror of https://github.com/jlelse/GoBlog
Fix and improve htmlText util method for TTS and use it for summary as well
This commit is contained in:
parent
d953b331c4
commit
df5098b4c0
|
@ -8,7 +8,6 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
gogeouri "git.jlel.se/jlelse/go-geouri"
|
gogeouri "git.jlel.se/jlelse/go-geouri"
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"github.com/araddon/dateparse"
|
"github.com/araddon/dateparse"
|
||||||
"gopkg.in/yaml.v3"
|
"gopkg.in/yaml.v3"
|
||||||
)
|
)
|
||||||
|
@ -113,11 +112,9 @@ func (a *goBlog) postSummary(p *post) (summary string) {
|
||||||
}
|
}
|
||||||
html := string(a.postHtml(p, false))
|
html := string(a.postHtml(p, false))
|
||||||
if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
|
if splitted := strings.Split(html, summaryDivider); len(splitted) > 1 {
|
||||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(splitted[0]))
|
summary = htmlText(splitted[0])
|
||||||
summary = doc.Text()
|
|
||||||
} else {
|
} else {
|
||||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
|
summary = strings.Split(htmlText(html), "\n\n")[0]
|
||||||
summary = doc.Find("p").First().Text()
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
2
tts.go
2
tts.go
|
@ -64,7 +64,7 @@ func (a *goBlog) createPostTTSAudio(p *post) error {
|
||||||
ssml.WriteString("<speak>")
|
ssml.WriteString("<speak>")
|
||||||
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
|
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
|
||||||
ssml.WriteString("<break time=\"1s\"/>")
|
ssml.WriteString("<break time=\"1s\"/>")
|
||||||
ssml.WriteString(html.EscapeString(cleanHTMLText(string(a.postHtml(p, false)))))
|
ssml.WriteString(html.EscapeString(htmlText(string(a.postHtml(p, false)))))
|
||||||
ssml.WriteString("</speak>")
|
ssml.WriteString("</speak>")
|
||||||
|
|
||||||
// Generate audio
|
// Generate audio
|
||||||
|
|
44
utils.go
44
utils.go
|
@ -246,24 +246,48 @@ func mBytesString(size int64) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func htmlText(s string) string {
|
func htmlText(s string) string {
|
||||||
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(s))
|
// Build policy to only allow a subset of HTML tags
|
||||||
|
textPolicy := bluemonday.StrictPolicy()
|
||||||
|
textPolicy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") // Headers
|
||||||
|
textPolicy.AllowElements("p") // Paragraphs
|
||||||
|
textPolicy.AllowElements("ol", "ul", "li") // Lists
|
||||||
|
textPolicy.AllowElements("blockquote") // Blockquotes
|
||||||
|
// Filter HTML tags
|
||||||
|
htmlBuf := textPolicy.SanitizeReader(strings.NewReader(s))
|
||||||
|
// Read HTML into document
|
||||||
|
doc, _ := goquery.NewDocumentFromReader(htmlBuf)
|
||||||
var text strings.Builder
|
var text strings.Builder
|
||||||
paragraphs := doc.Find("p")
|
if bodyChild := doc.Find("body").Children(); bodyChild.Length() > 0 {
|
||||||
if paragraphs.Length() == 0 {
|
// Input was real HTML, so build the text from the body
|
||||||
text.WriteString(doc.Text())
|
// Declare recursive function to print children
|
||||||
} else {
|
var printChilds func(childs *goquery.Selection)
|
||||||
paragraphs.Each(func(i int, s *goquery.Selection) {
|
printChilds = func(childs *goquery.Selection) {
|
||||||
if i > 0 {
|
childs.Each(func(i int, sel *goquery.Selection) {
|
||||||
|
if i > 0 && // Not first child
|
||||||
|
sel.Is("h1, h2, h3, h4, h5, h6, p, ol, ul, li, blockquote") { // All elements that start a new paragraph
|
||||||
text.WriteString("\n\n")
|
text.WriteString("\n\n")
|
||||||
}
|
}
|
||||||
text.WriteString(s.Text())
|
if sel.Is("ol > li") { // List item in ordered list
|
||||||
|
fmt.Fprintf(&text, "%d. ", i+1) // Add list item number
|
||||||
|
}
|
||||||
|
if sel.Children().Length() > 0 { // Has children
|
||||||
|
printChilds(sel.Children()) // Recursive call to print childs
|
||||||
|
} else {
|
||||||
|
text.WriteString(sel.Text()) // Print text
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
r := strings.TrimSpace(text.String())
|
printChilds(bodyChild)
|
||||||
return r
|
} else {
|
||||||
|
// Input was probably just text, so just use the text
|
||||||
|
text.WriteString(doc.Text())
|
||||||
|
}
|
||||||
|
// Trim whitespace and return
|
||||||
|
return strings.TrimSpace(text.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
func cleanHTMLText(s string) string {
|
func cleanHTMLText(s string) string {
|
||||||
|
// Clean HTML with UGC policy and return text
|
||||||
return htmlText(bluemonday.UGCPolicy().Sanitize(s))
|
return htmlText(bluemonday.UGCPolicy().Sanitize(s))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,13 +76,33 @@ func Test_urlHasExt(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func Test_cleanHTMLText(t *testing.T) {
|
func Test_htmlText(t *testing.T) {
|
||||||
assert.Equal(t, `"This is a 'test'" 😁`, cleanHTMLText(`"This is a 'test'" 😁`))
|
// Text without HTML
|
||||||
assert.Equal(t, `Test`, cleanHTMLText(`<b>Test</b>`))
|
assert.Equal(t, "This is a test", htmlText("This is a test"))
|
||||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText(`<p>Test</p><p>Test</p>`))
|
// Text without HTML, but with emojis
|
||||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<p>Test</p>\n<p>Test</p>"))
|
assert.Equal(t, "This is a test 😁", htmlText("This is a test 😁"))
|
||||||
assert.Equal(t, "Test\n\nTest", cleanHTMLText("<div><p>Test</p>\n<p>Test</p></div>"))
|
// Text without HTML, but with quotes
|
||||||
assert.Equal(t, "Test test\n\nTest", cleanHTMLText(`<p>Test <b>test</b></p><p>Test</p>`))
|
assert.Equal(t, "This is a 'test'", htmlText("This is a 'test'"))
|
||||||
|
// Text with formatting (like bold or italic)
|
||||||
|
assert.Equal(t, "This is a test", htmlText("<b>This is a test</b>"))
|
||||||
|
assert.Equal(t, "This is a test", htmlText("<i>This is a test</i>"))
|
||||||
|
// Unordered list
|
||||||
|
assert.Equal(t, "Test\n\nTest", htmlText(`<ul><li>Test</li><li>Test</li></ul>`))
|
||||||
|
// Ordered list
|
||||||
|
assert.Equal(t, "1. Test\n\n2. Test", htmlText(`<ol><li>Test</li><li>Test</li></ol>`))
|
||||||
|
// Nested unordered list
|
||||||
|
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<ul><li>Test</li><li><ul><li>Test</li><li>Test</li></ul></li></ul>`))
|
||||||
|
// Headers and paragraphs
|
||||||
|
assert.Equal(t, "Test\n\nTest", htmlText(`<h1>Test</h1><p>Test</p>`))
|
||||||
|
assert.Equal(t, "Test\n\nTest\n\nTest", htmlText(`<h1>Test</h1><p>Test</p><h2>Test</h2>`))
|
||||||
|
// Blockquote
|
||||||
|
assert.Equal(t, "Test\n\nBlockqoute content", htmlText(`<p>Test</p><blockquote><p>Blockqoute content</p></blockquote>`))
|
||||||
|
// Nested blockquotes
|
||||||
|
assert.Equal(t, "Blockqoute content\n\nBlockqoute content", htmlText(`<blockquote><p>Blockqoute content</p><blockquote><p>Blockqoute content</p></blockquote></blockquote>`))
|
||||||
|
// Code (should be ignored)
|
||||||
|
assert.Equal(t, "Test", htmlText(`<p>Test</p><pre><code>Code content</code></pre>`))
|
||||||
|
// Inline code (should not be ignored)
|
||||||
|
assert.Equal(t, "Test Code content", htmlText(`<p>Test <code>Code content</code></p>`))
|
||||||
}
|
}
|
||||||
|
|
||||||
func Test_containsStrings(t *testing.T) {
|
func Test_containsStrings(t *testing.T) {
|
||||||
|
|
Loading…
Reference in New Issue