From a3517a9a97b0b7638ac0405d661b9704c3abe07e Mon Sep 17 00:00:00 2001
From: Jan-Lukas Else
Date: Wed, 5 Jan 2022 10:56:53 +0100
Subject: [PATCH] Fix TTS for long texts

---
 go.mod                    |  1 +
 go.sum                    |  2 +
 pkgs/mp3merge/mp3merge.go | 90 +++++++++++++++++++++++++++++++++++++++
 tts.go                    | 78 +++++++++++++++++++++++-----------
 4 files changed, 146 insertions(+), 25 deletions(-)
 create mode 100644 pkgs/mp3merge/mp3merge.go

diff --git a/go.mod b/go.mod
index a63313c..2dcc387 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,7 @@ require (
 	github.com/cretz/bine v0.2.0
 	github.com/dchest/captcha v0.0.0-20200903113550-03f5f0333e1f
 	github.com/dgraph-io/ristretto v0.1.0
+	github.com/dmulholl/mp3lib v1.0.0
 	github.com/elnormous/contenttype v1.0.0
 	github.com/emersion/go-sasl v0.0.0-20211008083017-0b9dcfb154ac
 	github.com/emersion/go-smtp v0.15.0
diff --git a/go.sum b/go.sum
index 7030e6e..ff2a31d 100644
--- a/go.sum
+++ b/go.sum
@@ -97,6 +97,8 @@ github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUn
 github.com/djherbis/atime v1.1.0/go.mod h1:28OF6Y8s3NQWwacXc5eZTsEsiMzp7LF8MbXE+XJPdBE=
 github.com/dlclark/regexp2 v1.4.0 h1:F1rxgk7p4uKjwIQxBs9oAXe5CqrXlCduYEJvrF4u93E=
 github.com/dlclark/regexp2 v1.4.0/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
+github.com/dmulholl/mp3lib v1.0.0 h1:PZq24kJBIk5zIxi/t6Qp8/EOAbAqThyrUCpkUKLBeWQ=
+github.com/dmulholl/mp3lib v1.0.0/go.mod h1:4RoA+iht/khfwxmH1ieoxZTzYVbb0am/zdvFkyGRr6I=
 github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/dvyukov/go-fuzz v0.0.0-20210103155950-6a8e9d1f2415/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
diff --git a/pkgs/mp3merge/mp3merge.go b/pkgs/mp3merge/mp3merge.go
new file mode 100644
index 0000000..50aac50
--- /dev/null
+++ b/pkgs/mp3merge/mp3merge.go
@@ -0,0 +1,90 @@
+// Package mp3merge merges multiple MP3 streams into a single one.
+package mp3merge
+
+import (
+	"bytes"
+	"errors"
+	"io"
+
+	"github.com/dmulholl/mp3lib"
+)
+
+// MergeMP3 appends the MP3 frames of all readers in "in", in the given
+// order, and writes the result to "out" as one MP3 stream.
+//
+// A Xing or VBRI header at the start of an input is dropped. If the merged
+// frames use more than one bitrate, a new Xing header is written first.
+// An error is returned if "out" is nil, if no inputs are given, if an input
+// is nil, or if writing to "out" fails.
+//
+// Inspired by https://github.com/dmulholl/mp3cat/blob/2ec1e4fe4d995ebd41bf1887b3cab8e2a569b3d4/mp3cat.go
+func MergeMP3(out io.Writer, in ...io.Reader) error {
+	if out == nil {
+		return errors.New("writer not provided")
+	}
+	if len(in) == 0 {
+		return errors.New("no inputs specified")
+	}
+
+	var totalFrames, totalBytes uint32
+	var firstBitRate int
+	var isVBR bool
+	var tmpOut bytes.Buffer
+
+	// Loop over the inputs and append their MP3 frames to the temporary
+	// output. Buffering is needed because a VBR header, which is only known
+	// to be required after all frames were read, must be written first.
+	for _, inReader := range in {
+		if inReader == nil {
+			return errors.New("nil input")
+		}
+
+		isFirstFrame := true
+
+		for {
+			// Read the next frame from the input
+			frame := mp3lib.NextFrame(inReader)
+			if frame == nil {
+				break
+			}
+
+			// Skip the first frame if it's a VBR header
+			if isFirstFrame {
+				isFirstFrame = false
+				if mp3lib.IsXingHeader(frame) || mp3lib.IsVbriHeader(frame) {
+					continue
+				}
+			}
+
+			// If we detect more than one bitrate we'll need to add a VBR header to the output
+			if firstBitRate == 0 {
+				firstBitRate = frame.BitRate
+			} else if frame.BitRate != firstBitRate {
+				isVBR = true
+			}
+
+			// Write the frame to the temporary output
+			_, err := tmpOut.Write(frame.RawBytes)
+			if err != nil {
+				return err
+			}
+
+			// Increment the total number of frames and bytes
+			totalFrames++
+			totalBytes += uint32(len(frame.RawBytes))
+		}
+	}
+
+	// If we detected multiple bitrates, prepend a VBR header to the output
+	if isVBR {
+		xingHeader := mp3lib.NewXingHeader(totalFrames, totalBytes)
+		_, err := out.Write(xingHeader.RawBytes)
+		if err != nil {
+			return err
+		}
+	}
+
+	// Copy the temporary output to the output
+	_, err := tmpOut.WriteTo(out)
+	return err
+}
diff --git a/tts.go b/tts.go
index c9723db..55d9dc7 100644
--- a/tts.go
+++ b/tts.go
@@ -12,8 +12,10 @@ import (
 	"net/url"
 	"path"
 	"strings"
+	"sync"
 
 	"github.com/carlmjohnson/requests"
+	"go.goblog.app/app/pkgs/mp3merge"
 )
 
 const ttsParameter = "tts"
@@ -54,31 +56,63 @@ func (a *goBlog) ttsEnabled() bool {
 
 func (a *goBlog) createPostTTSAudio(p *post) error {
 	// Get required values
-	lang := a.cfg.Blogs[p.Blog].Lang
-	if lang == "" {
-		lang = "en"
+	lang := defaultIfEmpty(a.cfg.Blogs[p.Blog].Lang, "en")
+
+	// Create TTS text parts
+	parts := []string{}
+	// Add title if available
+	if title := p.Title(); title != "" {
+		parts = append(parts, a.renderMdTitle(title))
+	}
+	// Add body split into paragraphs because of 5000 character limit
+	parts = append(parts, strings.Split(htmlText(string(a.postHtml(p, false))), "\n\n")...)
+
+	// Create TTS audio for each part
+	partsBuffers := make([]io.Reader, len(parts))
+	var errs []error
+	var lock sync.Mutex
+	var wg sync.WaitGroup
+	for i, part := range parts {
+		// Increase wait group
+		wg.Add(1)
+		go func(i int, part string) {
+			// Decrease wait group on every return path, otherwise wg.Wait()
+			// below blocks forever when a part fails
+			defer wg.Done()
+			// Build SSML
+			ssml := "" + html.EscapeString(part) + ""
+			// Create TTS audio
+			var audioBuffer bytes.Buffer
+			err := a.createTTSAudio(lang, ssml, &audioBuffer)
+			if err != nil {
+				lock.Lock()
+				errs = append(errs, err)
+				lock.Unlock()
+				return
+			}
+			// Store buffer at the index of its part to keep the order
+			lock.Lock()
+			partsBuffers[i] = &audioBuffer
+			lock.Unlock()
+		}(i, part)
 	}
 
-	// Build SSML
-	var ssml strings.Builder
-	ssml.WriteString("")
-	ssml.WriteString(html.EscapeString(a.renderMdTitle(p.Title())))
-	ssml.WriteString("")
-	for _, part := range strings.Split(htmlText(string(a.postHtml(p, false))), "\n\n") {
-		ssml.WriteString(html.EscapeString(part))
-		ssml.WriteString("")
-	}
-	ssml.WriteString("")
+	// Wait for all parts to be created
+	wg.Wait()
 
-	// Generate audio
-	var audioBuffer bytes.Buffer
-	err := a.createTTSAudio(lang, ssml.String(), &audioBuffer)
-	if err != nil {
-		return err
+	// Check if any errors occurred
+	if len(errs) > 0 {
+		return errs[0]
 	}
 
+	// Merge partsBuffers into final buffer
+	var final bytes.Buffer
+	if err := mp3merge.MergeMP3(&final, partsBuffers...); err != nil {
+		return err
+	}
+
 	// Save audio
-	audioReader := bytes.NewReader(audioBuffer.Bytes())
+	audioReader := bytes.NewReader(final.Bytes())
 	fileHash, err := getSHA256(audioReader)
 	if err != nil {
 		return err
@@ -155,12 +189,6 @@ func (a *goBlog) createTTSAudio(lang, ssml string, w io.Writer) error {
 		return errors.New("writer not provided")
 	}
 
-	// Check max length
-	// TODO: Support longer texts by splitting into multiple requests
-	// if len(ssml) > 5000 {
-	// 	return errors.New("text is too long")
-	// }
-
 	// Create request body
 	body := map[string]interface{}{
 		"audioConfig": map[string]interface{}{