2021-04-03 15:39:43 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2021-07-13 11:21:32 +02:00
|
|
|
"context"
|
2021-04-03 15:39:43 +02:00
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
2021-07-13 11:21:32 +02:00
|
|
|
"sync/atomic"
|
2021-04-03 15:39:43 +02:00
|
|
|
"time"
|
2021-07-13 11:21:32 +02:00
|
|
|
|
2022-02-16 19:12:54 +01:00
|
|
|
"github.com/klauspost/compress/gzhttp"
|
2022-02-11 16:19:10 +01:00
|
|
|
"go.goblog.app/app/pkgs/bufferpool"
|
2021-07-13 11:21:32 +02:00
|
|
|
"golang.org/x/sync/singleflight"
|
2021-04-03 15:39:43 +02:00
|
|
|
)
|
|
|
|
|
2021-06-06 14:39:42 +02:00
|
|
|
func (a *goBlog) checkAllExternalLinks() {
|
2021-07-13 11:21:32 +02:00
|
|
|
// Get all published posts without parameters
|
2021-08-05 08:09:34 +02:00
|
|
|
posts, err := a.getPosts(&postsRequestConfig{status: statusPublished, withoutParameters: true})
|
2021-04-03 15:39:43 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Println(err.Error())
|
|
|
|
return
|
|
|
|
}
|
2021-07-14 15:44:57 +02:00
|
|
|
_ = a.checkLinks(log.Writer(), posts...)
|
2021-07-13 11:21:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (a *goBlog) checkLinks(w io.Writer, posts ...*post) error {
|
|
|
|
// Get all links
|
|
|
|
allLinks, err := a.allLinks(posts...)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
fmt.Fprintln(w, "Checking", len(allLinks), "links")
|
|
|
|
// Cancel context
|
|
|
|
var canceled, finished atomic.Value
|
|
|
|
canceled.Store(false)
|
|
|
|
finished.Store(false)
|
|
|
|
cancelContext, cancelFunc := context.WithCancel(context.Background())
|
|
|
|
a.shutdown.Add(func() {
|
|
|
|
if finished.Load().(bool) {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
canceled.Store(true)
|
|
|
|
cancelFunc()
|
|
|
|
fmt.Fprintln(w, "Canceled link check")
|
|
|
|
})
|
|
|
|
// Create HTTP client
|
2021-04-03 15:39:43 +02:00
|
|
|
client := &http.Client{
|
|
|
|
Timeout: 30 * time.Second,
|
2022-02-16 19:12:54 +01:00
|
|
|
Transport: gzhttp.Transport(&http.Transport{
|
2021-07-13 11:21:32 +02:00
|
|
|
// Limits
|
|
|
|
DisableKeepAlives: true,
|
|
|
|
MaxConnsPerHost: 1,
|
2022-02-16 19:12:54 +01:00
|
|
|
}),
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
// Process all links
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
var sm sync.Map
|
|
|
|
var sg singleflight.Group
|
|
|
|
con := make(chan bool, 5)
|
|
|
|
for _, l := range allLinks {
|
|
|
|
con <- true // This waits until there's space in the buffered channel
|
|
|
|
// Check if check is canceled
|
|
|
|
if canceled.Load().(bool) {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
// Increase wait group
|
2021-06-06 14:39:42 +02:00
|
|
|
wg.Add(1)
|
2021-07-13 11:21:32 +02:00
|
|
|
// Start link check
|
|
|
|
go func(link *stringPair) {
|
|
|
|
defer func() {
|
|
|
|
<-con // Read from channel, to let next routine execute
|
|
|
|
wg.Done()
|
|
|
|
}()
|
|
|
|
// Check if link is internal
|
|
|
|
if strings.HasPrefix(link.Second, a.cfg.Server.PublicAddress) {
|
|
|
|
return
|
2021-06-06 14:39:42 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
// Process link
|
|
|
|
r, err, _ := sg.Do(link.Second, func() (interface{}, error) {
|
|
|
|
// Check if already cached
|
|
|
|
if mr, ok := sm.Load(link.Second); ok {
|
|
|
|
return mr, nil
|
|
|
|
}
|
|
|
|
// Do request
|
|
|
|
req, err := http.NewRequestWithContext(cancelContext, http.MethodGet, link.Second, nil)
|
2021-06-06 14:39:42 +02:00
|
|
|
if err != nil {
|
2021-07-13 11:21:32 +02:00
|
|
|
return nil, err
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-06-06 14:39:42 +02:00
|
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0")
|
|
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
|
|
|
resp, err := client.Do(req)
|
|
|
|
if err != nil {
|
2021-07-13 11:21:32 +02:00
|
|
|
return nil, err
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
_ = resp.Body.Close()
|
|
|
|
// Cache result
|
|
|
|
sm.Store(link.Second, resp.StatusCode)
|
|
|
|
// Return result
|
|
|
|
return resp.StatusCode, nil
|
|
|
|
})
|
|
|
|
// Check error
|
|
|
|
if err != nil {
|
|
|
|
if !strings.Contains(err.Error(), "context canceled") {
|
|
|
|
fmt.Fprintln(w, "Error:", link.Second, err.Error())
|
|
|
|
}
|
|
|
|
return
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
// Check status code
|
|
|
|
if statusCode := r.(int); !successStatus(statusCode) {
|
|
|
|
fmt.Fprintln(w, link.Second, "in", link.First, statusCode, http.StatusText(statusCode))
|
2021-06-06 14:39:42 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
}(l)
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
// Wait for all links to finish
|
2021-04-03 15:39:43 +02:00
|
|
|
wg.Wait()
|
2021-07-13 11:21:32 +02:00
|
|
|
// Finish
|
|
|
|
finished.Store(true)
|
|
|
|
return nil
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
|
|
|
|
2021-07-13 11:21:32 +02:00
|
|
|
func (a *goBlog) allLinks(posts ...*post) (allLinks []*stringPair, err error) {
|
2021-04-03 15:39:43 +02:00
|
|
|
for _, p := range posts {
|
2022-02-11 16:19:10 +01:00
|
|
|
contentBuf := bufferpool.Get()
|
|
|
|
a.postHtmlToWriter(contentBuf, p, true)
|
|
|
|
links, err := allLinksFromHTML(contentBuf, a.fullPostURL(p))
|
|
|
|
bufferpool.Put(contentBuf)
|
2021-07-13 11:21:32 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
for _, link := range links {
|
|
|
|
allLinks = append(allLinks, &stringPair{a.fullPostURL(p), link})
|
|
|
|
}
|
2021-04-03 15:39:43 +02:00
|
|
|
}
|
2021-07-13 11:21:32 +02:00
|
|
|
return allLinks, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// successStatus reports whether status lies in the range 200-399, i.e. the
// status codes that the link check does not report as a problem.
func successStatus(status int) bool {
	// Everything below 200 is not a success
	if status < 200 {
		return false
	}
	// 400 and above are treated as failures
	return status < 400
}
|