mirror of https://github.com/jlelse/GoBlog
parent
34ab1b1fb2
commit
6bfaf16e25
@ -0,0 +1,188 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/carlmjohnson/requests"
|
||||
"github.com/dgraph-io/ristretto"
|
||||
"go.goblog.app/app/pkgs/bufferpool"
|
||||
"go.goblog.app/app/pkgs/contenttype"
|
||||
"go.goblog.app/app/pkgs/httpcachetransport"
|
||||
"willnorris.com/go/microformats"
|
||||
)
|
||||
|
||||
func (a *goBlog) initMicroformatsCache() {
|
||||
a.mfInit.Do(func() {
|
||||
a.mfCache, _ = ristretto.NewCache(&ristretto.Config{
|
||||
NumCounters: 100,
|
||||
MaxCost: 10, // Cache http responses for 10 requests
|
||||
BufferItems: 64,
|
||||
IgnoreInternalCost: true,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// microformatsResult holds the values extracted from a page's microformats
// (h-entry) markup.
type microformatsResult struct {
	// Title is the trimmed first "name" property of the selected entry.
	Title string
	// Content is the whitespace-normalized text of the entry's content HTML.
	Content string
	// Author is the trimmed first "name" property of the entry's author.
	Author string
	// Url is the entry's first "url" property.
	Url string

	// source is the URL the document was requested for; entry URLs are
	// compared against it.
	source string
	// hasUrl reports whether an entry whose URL matches source was found.
	hasUrl bool
}
|
||||
|
||||
func (a *goBlog) parseMicroformats(u string, cache bool) (*microformatsResult, error) {
|
||||
buf := bufferpool.Get()
|
||||
defer bufferpool.Put(buf)
|
||||
rb := requests.URL(u).
|
||||
Method(http.MethodGet).
|
||||
Accept(contenttype.HTMLUTF8).
|
||||
Client(a.httpClient).
|
||||
ToBytesBuffer(buf)
|
||||
if cache {
|
||||
a.initMicroformatsCache()
|
||||
rb.Transport(httpcachetransport.NewHttpCacheTransport(a.httpClient.Transport, a.mfCache, 10*time.Minute))
|
||||
}
|
||||
err := rb.Fetch(context.Background())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return a.parseMicroformatsFromBytes(u, buf.Bytes())
|
||||
}
|
||||
|
||||
func (a *goBlog) parseMicroformatsFromBytes(u string, b []byte) (*microformatsResult, error) {
|
||||
parsedUrl, err := url.Parse(u)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
m := µformatsResult{
|
||||
source: u,
|
||||
}
|
||||
// Fill from microformats
|
||||
m.fillFromData(microformats.Parse(bytes.NewReader(b), parsedUrl))
|
||||
if m.Url == "" {
|
||||
m.Url = u
|
||||
}
|
||||
// Set title when content is empty as well
|
||||
if m.Title == "" && m.Content == "" {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(b))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if title := doc.Find("title"); title != nil {
|
||||
m.Title = title.Text()
|
||||
}
|
||||
}
|
||||
// Reset title if it's just a prefix of the content
|
||||
if m.Title != "" && strings.HasPrefix(m.Content, m.Title) {
|
||||
m.Title = ""
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m *microformatsResult) fillFromData(mf *microformats.Data) {
|
||||
// Fill data
|
||||
for _, i := range mf.Items {
|
||||
if m.fill(i) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *microformatsResult) fill(mf *microformats.Microformat) bool {
|
||||
if mfHasType(mf, "h-entry") {
|
||||
// Check URL
|
||||
if url, ok := mf.Properties["url"]; ok && len(url) > 0 {
|
||||
if url0, ok := url[0].(string); ok {
|
||||
if strings.EqualFold(url0, m.source) {
|
||||
// Is searched entry
|
||||
m.hasUrl = true
|
||||
m.Url = url0
|
||||
// Reset attributes to refill
|
||||
m.Author = ""
|
||||
m.Title = ""
|
||||
m.Content = ""
|
||||
} else if m.hasUrl {
|
||||
// Already found entry
|
||||
return false
|
||||
} else if m.Url == "" {
|
||||
// Is the first entry
|
||||
m.Url = url0
|
||||
} else {
|
||||
// Is not the first entry
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
// Title
|
||||
m.fillTitle(mf)
|
||||
// Content
|
||||
m.fillContent(mf)
|
||||
// Author
|
||||
m.fillAuthor(mf)
|
||||
return m.hasUrl
|
||||
}
|
||||
for _, mfc := range mf.Children {
|
||||
if m.fill(mfc) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (m *microformatsResult) fillTitle(mf *microformats.Microformat) {
|
||||
if m.Title != "" {
|
||||
return
|
||||
}
|
||||
if name, ok := mf.Properties["name"]; ok && len(name) > 0 {
|
||||
if title, ok := name[0].(string); ok {
|
||||
m.Title = strings.TrimSpace(title)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *microformatsResult) fillContent(mf *microformats.Microformat) {
|
||||
if m.Content != "" {
|
||||
return
|
||||
}
|
||||
if contents, ok := mf.Properties["content"]; ok && len(contents) > 0 {
|
||||
if content, ok := contents[0].(map[string]string); ok {
|
||||
if contentHTML, ok := content["html"]; ok {
|
||||
m.Content = cleanHTMLText(contentHTML)
|
||||
// Replace newlines with spaces
|
||||
m.Content = strings.ReplaceAll(m.Content, "\n", " ")
|
||||
// Collapse double spaces
|
||||
m.Content = strings.Join(strings.Fields(m.Content), " ")
|
||||
// Trim spaces
|
||||
m.Content = strings.TrimSpace(m.Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *microformatsResult) fillAuthor(mf *microformats.Microformat) {
|
||||
if m.Author != "" {
|
||||
return
|
||||
}
|
||||
if authors, ok := mf.Properties["author"]; ok && len(authors) > 0 {
|
||||
if author, ok := authors[0].(*microformats.Microformat); ok {
|
||||
if names, ok := author.Properties["name"]; ok && len(names) > 0 {
|
||||
if name, ok := names[0].(string); ok {
|
||||
m.Author = strings.TrimSpace(name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func mfHasType(mf *microformats.Microformat, typ string) bool {
|
||||
for _, t := range mf.Type {
|
||||
if typ == t {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func Test_parseMicroformats(t *testing.T) {
|
||||
|
||||
app := &goBlog{
|
||||
cfg: createDefaultTestConfig(t),
|
||||
}
|
||||
err := app.initConfig(false)
|
||||
require.NoError(t, err)
|
||||
|
||||
testHtmlBytes, err := os.ReadFile("testdata/wmtest.html")
|
||||
require.NoError(t, err)
|
||||
testHtml := string(testHtmlBytes)
|
||||
|
||||
mockClient := newFakeHttpClient()
|
||||
mockClient.setFakeResponse(http.StatusOK, testHtml)
|
||||
|
||||
app.httpClient = mockClient.Client
|
||||
|
||||
m, err := app.parseMicroformats("https://example.net/articles/micropub-crossposting-to-twitter-and-enabling-tweetstorms", false)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "Micropub, Crossposting to Twitter, and Enabling “Tweetstorms”", m.Title)
|
||||
assert.NotEmpty(t, m.Content)
|
||||
assert.Equal(t, "Test Blogger", m.Author)
|
||||
assert.Equal(t, "https://example.net/articles/micropub-crossposting-to-twitter-and-enabling-tweetstorms", m.Url)
|
||||
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
package httpcachetransport
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"time"
|
||||
|
||||
"github.com/dgraph-io/ristretto"
|
||||
)
|
||||
|
||||
type httpCacheTransport struct {
|
||||
parent http.RoundTripper
|
||||
ristrettoCache *ristretto.Cache
|
||||
ttl time.Duration
|
||||
}
|
||||
|
||||
func (t *httpCacheTransport) RoundTrip(r *http.Request) (*http.Response, error) {
|
||||
requestUrl := r.URL.String()
|
||||
if t.ristrettoCache != nil {
|
||||
if cached, hasCached := t.ristrettoCache.Get(requestUrl); hasCached {
|
||||
if cachedResp, ok := cached.([]byte); ok {
|
||||
return http.ReadResponse(bufio.NewReader(bytes.NewReader(cachedResp)), r)
|
||||
}
|
||||
}
|
||||
}
|
||||
resp, err := t.parent.RoundTrip(r)
|
||||
if err == nil && t.ristrettoCache != nil {
|
||||
respBytes, err := httputil.DumpResponse(resp, true)
|
||||
if err != nil {
|
||||
return resp, err
|
||||
}
|
||||
t.ristrettoCache.SetWithTTL(requestUrl, respBytes, 1, t.ttl)
|
||||
t.ristrettoCache.Wait()
|
||||
return http.ReadResponse(bufio.NewReader(bytes.NewReader(respBytes)), r)
|
||||
}
|
||||
return resp, err
|
||||
}
|
||||
|
||||
// Creates a new http.RoundTripper that caches all
|
||||
// request responses (by the request URL) in ristretto.
|
||||
func NewHttpCacheTransport(parent http.RoundTripper, ristrettoCache *ristretto.Cache, ttl time.Duration) http.RoundTripper {
|
||||
return &httpCacheTransport{parent, ristrettoCache, ttl}
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
package httpcachetransport
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/carlmjohnson/requests"
|
||||
"github.com/dgraph-io/ristretto"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// fakeResponse is a minimal raw HTTP/1.1 response (status line, two headers,
// an empty separator line and a tiny HTML body) that the fake transport in
// the test parses with http.ReadResponse.
const fakeResponse = `HTTP/1.1 200 OK
Content-Type: text/html; charset=UTF-8
Date: Wed, 14 Dec 2022 10:34:03 GMT

<!doctype html>
<html>
</html>`
|
||||
|
||||
func TestHttpCacheTransport(t *testing.T) {
|
||||
cache, _ := ristretto.NewCache(&ristretto.Config{
|
||||
NumCounters: 100,
|
||||
MaxCost: 10,
|
||||
BufferItems: 64,
|
||||
IgnoreInternalCost: true,
|
||||
})
|
||||
|
||||
counter := 0
|
||||
|
||||
orig := requests.RoundTripFunc(func(req *http.Request) (res *http.Response, err error) {
|
||||
counter++
|
||||
return http.ReadResponse(bufio.NewReader(strings.NewReader(fakeResponse)), req)
|
||||
})
|
||||
|
||||
client := &http.Client{
|
||||
Transport: NewHttpCacheTransport(orig, cache, time.Minute),
|
||||
}
|
||||
|
||||
err := requests.URL("https://example.com/").Client(client).Fetch(context.Background())
|
||||
assert.NoError(t, err)
|
||||
|
||||
err = requests.URL("https://example.com/").Client(client).Fetch(context.Background())
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Equal(t, 1, counter)
|
||||
}
|
Loading…
Reference in new issue