GoBlog/check.go

148 lines
3.7 KiB
Go
Raw Normal View History

package main
import (
2021-07-13 09:21:32 +00:00
"context"
"fmt"
"io"
"log"
"net/http"
"strings"
"sync"
2021-07-13 09:21:32 +00:00
"sync/atomic"
"time"
2021-07-13 09:21:32 +00:00
2022-02-16 18:12:54 +00:00
"github.com/klauspost/compress/gzhttp"
2022-03-16 07:28:03 +00:00
"github.com/samber/lo"
"go.goblog.app/app/pkgs/bufferpool"
2021-07-13 09:21:32 +00:00
"golang.org/x/sync/singleflight"
)
func (a *goBlog) checkAllExternalLinks() {
posts, err := a.getPosts(&postsRequestConfig{
status: []postStatus{statusPublished},
visibility: []postVisibility{visibilityPublic, visibilityUnlisted},
withoutParameters: true,
})
if err != nil {
log.Println(err.Error())
return
}
_ = a.checkLinks(log.Writer(), posts...)
2021-07-13 09:21:32 +00:00
}
func (a *goBlog) checkLinks(w io.Writer, posts ...*post) error {
// Get all links
allLinks, err := a.allLinks(posts...)
if err != nil {
return err
}
fmt.Fprintln(w, "Checking", len(allLinks), "links")
// Cancel context
var canceled, finished atomic.Value
canceled.Store(false)
finished.Store(false)
cancelContext, cancelFunc := context.WithCancel(context.Background())
a.shutdown.Add(func() {
if finished.Load().(bool) {
return
}
canceled.Store(true)
cancelFunc()
fmt.Fprintln(w, "Canceled link check")
})
// Create HTTP client
client := &http.Client{
Timeout: 30 * time.Second,
2022-02-16 18:12:54 +00:00
Transport: gzhttp.Transport(&http.Transport{
2021-07-13 09:21:32 +00:00
// Limits
DisableKeepAlives: true,
MaxConnsPerHost: 1,
2022-02-16 18:12:54 +00:00
}),
}
2021-07-13 09:21:32 +00:00
// Process all links
var wg sync.WaitGroup
var sm sync.Map
var sg singleflight.Group
con := make(chan bool, 5)
for _, l := range allLinks {
con <- true // This waits until there's space in the buffered channel
// Check if check is canceled
if canceled.Load().(bool) {
break
}
// Increase wait group
wg.Add(1)
2021-07-13 09:21:32 +00:00
// Start link check
go func(link *stringPair) {
defer func() {
<-con // Read from channel, to let next routine execute
wg.Done()
}()
// Check if link is internal
if strings.HasPrefix(link.Second, a.cfg.Server.PublicAddress) {
return
}
2021-07-13 09:21:32 +00:00
// Process link
2022-03-16 07:28:03 +00:00
r, err, _ := sg.Do(link.Second, func() (any, error) {
2021-07-13 09:21:32 +00:00
// Check if already cached
if mr, ok := sm.Load(link.Second); ok {
return mr, nil
}
// Do request
req, err := http.NewRequestWithContext(cancelContext, http.MethodGet, link.Second, nil)
if err != nil {
2021-07-13 09:21:32 +00:00
return nil, err
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0")
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
resp, err := client.Do(req)
if err != nil {
2021-07-13 09:21:32 +00:00
return nil, err
}
2021-07-13 09:21:32 +00:00
_ = resp.Body.Close()
// Cache result
sm.Store(link.Second, resp.StatusCode)
// Return result
return resp.StatusCode, nil
})
// Check error
if err != nil {
if !strings.Contains(err.Error(), "context canceled") {
fmt.Fprintln(w, "Error:", link.Second, err.Error())
}
return
}
2021-07-13 09:21:32 +00:00
// Check status code
if statusCode := r.(int); !successStatus(statusCode) {
fmt.Fprintln(w, link.Second, "in", link.First, statusCode, http.StatusText(statusCode))
}
2021-07-13 09:21:32 +00:00
}(l)
}
2021-07-13 09:21:32 +00:00
// Wait for all links to finish
wg.Wait()
2021-07-13 09:21:32 +00:00
// Finish
finished.Store(true)
return nil
}
2021-07-13 09:21:32 +00:00
func (a *goBlog) allLinks(posts ...*post) (allLinks []*stringPair, err error) {
for _, p := range posts {
contentBuf := bufferpool.Get()
2022-12-26 18:52:06 +00:00
a.postHtmlToWriter(contentBuf, &postHtmlOptions{p: p, absolute: true})
links, err := allLinksFromHTML(contentBuf, a.fullPostURL(p))
bufferpool.Put(contentBuf)
2021-07-13 09:21:32 +00:00
if err != nil {
return nil, err
}
2022-03-16 07:28:03 +00:00
allLinks = lo.Map(links, func(s string, _ int) *stringPair {
return &stringPair{a.fullPostURL(p), s}
})
}
2021-07-13 09:21:32 +00:00
return allLinks, nil
}
// successStatus reports whether status lies in the range 200 to 399
// (inclusive), i.e. whether it is a 2xx or 3xx HTTP status code.
func successStatus(status int) bool {
	switch {
	case status < 200, status >= 400:
		// Below the success range, or a client/server error and above
		return false
	default:
		return true
	}
}