Fix TTS for long texts

This commit is contained in:
Jan-Lukas Else 2022-01-05 10:56:53 +01:00
parent fac8129c99
commit a3517a9a97
4 changed files with 130 additions and 25 deletions

1
go.mod
View File

@ -15,6 +15,7 @@ require (
github.com/cretz/bine v0.2.0
github.com/dchest/captcha v0.0.0-20200903113550-03f5f0333e1f
github.com/dgraph-io/ristretto v0.1.0
github.com/dmulholl/mp3lib v1.0.0
github.com/elnormous/contenttype v1.0.0
github.com/emersion/go-sasl v0.0.0-20211008083017-0b9dcfb154ac
github.com/emersion/go-smtp v0.15.0

2
go.sum
View File

@ -97,6 +97,8 @@ github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUn
github.com/djherbis/atime v1.1.0/go.mod h1:28OF6Y8s3NQWwacXc5eZTsEsiMzp7LF8MbXE+XJPdBE=
github.com/dlclark/regexp2 v1.4.0 h1:F1rxgk7p4uKjwIQxBs9oAXe5CqrXlCduYEJvrF4u93E=
github.com/dlclark/regexp2 v1.4.0/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
github.com/dmulholl/mp3lib v1.0.0 h1:PZq24kJBIk5zIxi/t6Qp8/EOAbAqThyrUCpkUKLBeWQ=
github.com/dmulholl/mp3lib v1.0.0/go.mod h1:4RoA+iht/khfwxmH1ieoxZTzYVbb0am/zdvFkyGRr6I=
github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/dvyukov/go-fuzz v0.0.0-20210103155950-6a8e9d1f2415/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=

77
pkgs/mp3merge/mp3merge.go Normal file
View File

@ -0,0 +1,77 @@
package mp3merge
import (
"bytes"
"errors"
"io"
"github.com/dmulholl/mp3lib"
)
// Inspired by https://github.com/dmulholl/mp3cat/blob/2ec1e4fe4d995ebd41bf1887b3cab8e2a569b3d4/mp3cat.go

// MergeMP3 concatenates the MP3 frames read from every reader in in, in the
// order given, and writes the combined stream to out.
//
// A leading Xing or VBRI header frame of an input is dropped. When the kept
// frames do not all share the same bitrate, a freshly built Xing header
// describing the merged stream is written to out before the frames.
//
// An error is returned when no inputs are passed, when one of the inputs is
// nil, or when writing fails. Nothing is written to out until every input has
// been consumed.
func MergeMP3(out io.Writer, in ...io.Reader) error {
	if len(in) == 0 {
		return errors.New("no inputs specified")
	}

	var (
		frameCount, byteCount uint32
		referenceBitRate      int
		mixedBitRates         bool
		merged                bytes.Buffer
	)

	for _, src := range in {
		if src == nil {
			return errors.New("nil input")
		}
		// atStart is only true while handling the very first frame of src;
		// the post statement also runs on "continue", so it is cleared after
		// a skipped header frame as well.
		for atStart := true; ; atStart = false {
			frame := mp3lib.NextFrame(src)
			if frame == nil {
				// No further frame available from this input.
				break
			}
			// A VBR header at the start of an input describes only that
			// input, so it must not end up in the merged stream.
			if atStart && (mp3lib.IsXingHeader(frame) || mp3lib.IsVbriHeader(frame)) {
				continue
			}
			// Remember the first bitrate seen and note any deviation from it.
			switch {
			case referenceBitRate == 0:
				referenceBitRate = frame.BitRate
			case frame.BitRate != referenceBitRate:
				mixedBitRates = true
			}
			// Frames are buffered because the header (if needed) has to be
			// written first, but its contents are only known at the end.
			if _, err := merged.Write(frame.RawBytes); err != nil {
				return err
			}
			frameCount++
			byteCount += uint32(len(frame.RawBytes))
		}
	}

	// Mixed bitrates: emit a VBR header ahead of the buffered frames.
	if mixedBitRates {
		header := mp3lib.NewXingHeader(frameCount, byteCount)
		if _, err := out.Write(header.RawBytes); err != nil {
			return err
		}
	}

	// Flush the buffered frames to the real output.
	_, err := merged.WriteTo(out)
	return err
}

75
tts.go
View File

@ -12,8 +12,10 @@ import (
"net/url"
"path"
"strings"
"sync"
"github.com/carlmjohnson/requests"
"go.goblog.app/app/pkgs/mp3merge"
)
const ttsParameter = "tts"
@ -54,31 +56,60 @@ func (a *goBlog) ttsEnabled() bool {
func (a *goBlog) createPostTTSAudio(p *post) error {
// Get required values
lang := a.cfg.Blogs[p.Blog].Lang
if lang == "" {
lang = "en"
lang := defaultIfEmpty(a.cfg.Blogs[p.Blog].Lang, "en")
// Create TTS text parts
parts := []string{}
// Add title if available
if title := p.Title(); title != "" {
parts = append(parts, a.renderMdTitle(title))
}
// Add body split into paragraphs because of 5000 character limit
parts = append(parts, strings.Split(htmlText(string(a.postHtml(p, false))), "\n\n")...)
// Create TTS audio for each part
partsBuffers := make([]io.Reader, len(parts))
var errs []error
var lock sync.Mutex
var wg sync.WaitGroup
for i, part := range parts {
// Increase wait group
wg.Add(1)
go func(i int, part string) {
// Build SSML
ssml := "<speak>" + html.EscapeString(part) + "<break time=\"500ms\"/></speak>"
// Create TTS audio
var audioBuffer bytes.Buffer
err := a.createTTSAudio(lang, ssml, &audioBuffer)
if err != nil {
lock.Lock()
errs = append(errs, err)
lock.Unlock()
return
}
// Append buffer to partsBuffers
lock.Lock()
partsBuffers[i] = &audioBuffer
lock.Unlock()
// Decrease wait group
wg.Done()
}(i, part)
}
// Build SSML
var ssml strings.Builder
ssml.WriteString("<speak>")
ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
ssml.WriteString("<break time=\"1s\"/>")
for _, part := range strings.Split(htmlText(string(a.postHtml(p, false))), "\n\n") {
ssml.WriteString(html.EscapeString(part))
ssml.WriteString("<break time=\"500ms\"/>")
}
ssml.WriteString("</speak>")
// Wait for all parts to be created
wg.Wait()
// Generate audio
var audioBuffer bytes.Buffer
err := a.createTTSAudio(lang, ssml.String(), &audioBuffer)
if err != nil {
return err
// Check if any errors occurred
if len(errs) > 0 {
return errs[0]
}
// Merge partsBuffers into final buffer
var final bytes.Buffer
mp3merge.MergeMP3(&final, partsBuffers...)
// Save audio
audioReader := bytes.NewReader(audioBuffer.Bytes())
audioReader := bytes.NewReader(final.Bytes())
fileHash, err := getSHA256(audioReader)
if err != nil {
return err
@ -155,12 +186,6 @@ func (a *goBlog) createTTSAudio(lang, ssml string, w io.Writer) error {
return errors.New("writer not provided")
}
// Check max length
// TODO: Support longer texts by splitting into multiple requests
// if len(ssml) > 5000 {
// return errors.New("text is too long")
// }
// Create request body
body := map[string]interface{}{
"audioConfig": map[string]interface{}{