feat: new `GetTextURLsToUnfurl` endpoint which extends GetURLs (#4294)

This commit is contained in:
Igor Sirotin 2023-11-17 13:32:37 +00:00 committed by GitHub
parent 4ae9c02e57
commit 2fef9a8f8f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 144 additions and 89 deletions

View File

@ -1 +1 @@
0.171.12
0.171.13

View File

@ -3,6 +3,7 @@ package protocol
import (
"errors"
"fmt"
"math"
"net/http"
neturl "net/url"
"regexp"
@ -19,6 +20,29 @@ import (
const UnfurledLinksPerMessageLimit = 5
type URLUnfurlPermission int
const (
URLUnfurlingAllowed URLUnfurlPermission = iota
URLUnfurlingAskUser
URLUnfurlingForbiddenBySettings
URLUnfurlingNotSupported
)
type URLUnfurlingMetadata struct {
URL string `json:"url"`
Permission URLUnfurlPermission `json:"permission"`
IsStatusSharedURL bool `json:"isStatusSharedURL"`
}
type URLsUnfurlPlan struct {
URLs []URLUnfurlingMetadata `json:"urls"`
}
func URLUnfurlingSupported(url string) bool {
return !strings.HasSuffix(url, ".gif")
}
type UnfurlURLsResponse struct {
LinkPreviews []*common.LinkPreview `json:"linkPreviews,omitempty"`
StatusLinkPreviews []*common.StatusLinkPreview `json:"statusLinkPreviews,omitempty"`
@ -92,18 +116,23 @@ func parseValidURL(rawURL string) (*neturl.URL, error) {
return u, nil
}
// GetURLs returns only what we consider unfurleable URLs.
//
// If we wanted to be extra precise and help improve UX, we could ignore URLs
// that we know can't be unfurled. This is at least possible with the oEmbed
// protocol because providers must specify an endpoint scheme.
func GetURLs(text string) []string {
func (m *Messenger) GetTextURLsToUnfurl(text string) *URLsUnfurlPlan {
s, err := m.getSettings()
if err != nil {
// log the error and keep parsing the text
m.logger.Error("GetTextURLsToUnfurl: failed to get settings", zap.Error(err))
s.URLUnfurlingMode = settings.URLUnfurlingDisableAll
}
indexedUrls := map[string]struct{}{}
result := &URLsUnfurlPlan{
// The usage of `UnfurledLinksPerMessageLimit` is quite random here. I wanted to allocate
// some not-zero place here, using the limit number is at least some binding.
URLs: make([]URLUnfurlingMetadata, 0, UnfurledLinksPerMessageLimit),
}
parsedText := markdown.Parse([]byte(text), nil)
visitor := common.RunLinksVisitor(parsedText)
urls := make([]string, 0, len(visitor.Links))
indexed := make(map[string]any, len(visitor.Links))
for _, rawURL := range visitor.Links {
parsedURL, err := parseValidURL(rawURL)
if err != nil {
@ -116,23 +145,54 @@ func GetURLs(text string) []string {
// case-sensitive, some websites encode base64 in the path, etc.
parsedURL.Host = strings.ToLower(parsedURL.Host)
idx := parsedURL.String()
// Removes the spurious trailing forward slash.
idx = strings.TrimRight(idx, "/")
if _, exists := indexed[idx]; exists {
url := parsedURL.String()
url = strings.TrimRight(url, "/") // Removes the spurious trailing forward slash.
if _, exists := indexedUrls[url]; exists {
continue
} else {
indexed[idx] = nil
urls = append(urls, idx)
}
// This is a temporary limitation solution,
// should be changed with https://github.com/status-im/status-go/issues/4235
if len(urls) == UnfurledLinksPerMessageLimit {
metadata := URLUnfurlingMetadata{
URL: url,
IsStatusSharedURL: IsStatusSharedURL(url),
}
if !URLUnfurlingSupported(rawURL) {
metadata.Permission = URLUnfurlingNotSupported
} else if metadata.IsStatusSharedURL {
metadata.Permission = URLUnfurlingAllowed
} else {
switch s.URLUnfurlingMode {
case settings.URLUnfurlingAlwaysAsk:
metadata.Permission = URLUnfurlingAskUser
case settings.URLUnfurlingEnableAll:
metadata.Permission = URLUnfurlingAllowed
case settings.URLUnfurlingDisableAll:
metadata.Permission = URLUnfurlingForbiddenBySettings
default:
metadata.Permission = URLUnfurlingForbiddenBySettings
}
}
result.URLs = append(result.URLs, metadata)
}
return result
}
// Deprecated: GetURLs is deprecated in favor of more generic GetTextURLsToUnfurl.
//
// This is a wrapper around GetTextURLsToUnfurl that returns the list of URLs found in the text
// without any additional information.
func (m *Messenger) GetURLs(text string) []string {
plan := m.GetTextURLsToUnfurl(text)
limit := int(math.Min(UnfurledLinksPerMessageLimit, float64(len(plan.URLs))))
urls := make([]string, 0, limit)
for _, metadata := range plan.URLs {
urls = append(urls, metadata.URL)
if len(urls) == limit {
break
}
}
return urls
}
@ -145,11 +205,6 @@ func NewDefaultHTTPClient() *http.Client {
func (m *Messenger) UnfurlURLs(httpClient *http.Client, urls []string) (UnfurlURLsResponse, error) {
response := UnfurlURLsResponse{}
s, err := m.getSettings()
if err != nil {
return response, fmt.Errorf("failed to get settigs: %w", err)
}
// Unfurl in a loop
response.LinkPreviews = make([]*common.LinkPreview, 0, len(urls))
@ -173,12 +228,6 @@ func (m *Messenger) UnfurlURLs(httpClient *http.Client, urls []string) (UnfurlUR
continue
}
// `AlwaysAsk` mode should be handled on the app side
// and is considered as equal to `EnableAll` in status-go.
if s.URLUnfurlingMode == settings.URLUnfurlingDisableAll {
continue
}
p, err := m.unfurlURL(httpClient, url)
if err != nil {
m.logger.Warn("failed to unfurl", zap.String("url", url), zap.Error(err))

View File

@ -8,6 +8,7 @@ import (
"net/http"
"net/url"
"regexp"
"strings"
"testing"
"time"
@ -186,7 +187,7 @@ func (s *MessengerLinkPreviewsTestSuite) Test_GetLinks() {
}
for _, ex := range examples {
links := GetURLs(ex.args)
links := s.m.GetURLs(ex.args)
s.Require().Equal(ex.expected, links, "Failed for args: '%s'", ex.args)
}
}
@ -606,79 +607,75 @@ func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_StatusCommunityJoined()
}
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Settings() {
// Create website stub
ogLink := "https://github.com"
requestsCount := 0
const ogLink = "https://github.com"
const statusUserLink = "https://status.app/c#zQ3shYSHp7GoiXaauJMnDcjwU2yNjdzpXLosAWapPS4CFxc11"
const gifLink = "https://media1.giphy.com/media/lcG3qwtTKSNI2i5vst/giphy.gif"
transport := StubTransport{}
transport.AddURLMatcherRoundTrip(
ogLink,
func(req *http.Request) *http.Response {
requestsCount++
responseBody := []byte(`<html><head><meta property="og:title" content="TestTitle"></head></html>`)
return &http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewBuffer(responseBody)),
}
},
)
stubbedClient := http.Client{Transport: &transport}
// Add contact
identity, err := crypto.GenerateKey()
s.Require().NoError(err)
c, err := BuildContactFromPublicKey(&identity.PublicKey)
s.Require().NoError(err)
s.Require().NotNil(c)
c.Bio = "TestBio_1"
c.DisplayName = "TestDisplayName_2"
s.m.allContacts.Store(c.ID, c)
statusUserLink, err := s.m.ShareUserURLWithData(c.ID)
s.Require().NoError(err)
linksToUnfurl := []string{ogLink, statusUserLink}
linksToUnfurl := []string{ogLink, statusUserLink, gifLink}
text := strings.Join(linksToUnfurl, " ")
// Test `AlwaysAsk`
// NOTE: on status-go side `AlwaysAsk` == `EnableAll`, "asking" should be processed by the app
requestsCount = 0
err = s.m.settings.SaveSettingField(settings.URLUnfurlingMode, settings.URLUnfurlingAlwaysAsk)
err := s.m.settings.SaveSettingField(settings.URLUnfurlingMode, settings.URLUnfurlingAlwaysAsk)
s.Require().NoError(err)
linkPreviews, err := s.m.UnfurlURLs(&stubbedClient, linksToUnfurl)
s.Require().NoError(err)
s.Require().Len(linkPreviews.LinkPreviews, 1)
s.Require().Len(linkPreviews.StatusLinkPreviews, 1)
s.Require().Equal(requestsCount, 1)
plan := s.m.GetTextURLsToUnfurl(text)
s.Require().Len(plan.URLs, len(linksToUnfurl))
s.Require().Equal(plan.URLs[0].URL, ogLink)
s.Require().Equal(plan.URLs[0].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[0].Permission, URLUnfurlingAskUser)
s.Require().Equal(plan.URLs[1].URL, statusUserLink)
s.Require().Equal(plan.URLs[1].IsStatusSharedURL, true)
s.Require().Equal(plan.URLs[1].Permission, URLUnfurlingAllowed)
s.Require().Equal(plan.URLs[2].URL, gifLink)
s.Require().Equal(plan.URLs[2].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[2].Permission, URLUnfurlingNotSupported)
// Test `EnableAll`
requestsCount = 0
err = s.m.settings.SaveSettingField(settings.URLUnfurlingMode, settings.URLUnfurlingEnableAll)
s.Require().NoError(err)
linkPreviews, err = s.m.UnfurlURLs(&stubbedClient, linksToUnfurl)
s.Require().NoError(err)
s.Require().Len(linkPreviews.LinkPreviews, 1)
s.Require().Len(linkPreviews.StatusLinkPreviews, 1)
s.Require().Equal(requestsCount, 1)
plan = s.m.GetTextURLsToUnfurl(text)
s.Require().Len(plan.URLs, len(linksToUnfurl))
s.Require().Equal(plan.URLs[0].URL, ogLink)
s.Require().Equal(plan.URLs[0].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[0].Permission, URLUnfurlingAllowed)
s.Require().Equal(plan.URLs[1].URL, statusUserLink)
s.Require().Equal(plan.URLs[1].IsStatusSharedURL, true)
s.Require().Equal(plan.URLs[1].Permission, URLUnfurlingAllowed)
s.Require().Equal(plan.URLs[2].URL, gifLink)
s.Require().Equal(plan.URLs[2].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[2].Permission, URLUnfurlingNotSupported)
// Test `DisableAll`
requestsCount = 0
err = s.m.settings.SaveSettingField(settings.URLUnfurlingMode, settings.URLUnfurlingDisableAll)
s.Require().NoError(err)
linkPreviews, err = s.m.UnfurlURLs(&stubbedClient, linksToUnfurl)
s.Require().NoError(err)
s.Require().Len(linkPreviews.LinkPreviews, 0)
s.Require().Len(linkPreviews.StatusLinkPreviews, 1) // Status links are always unfurled
s.Require().Equal(requestsCount, 0)
plan = s.m.GetTextURLsToUnfurl(text)
s.Require().Len(plan.URLs, len(linksToUnfurl))
s.Require().Equal(plan.URLs[0].URL, ogLink)
s.Require().Equal(plan.URLs[0].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[0].Permission, URLUnfurlingForbiddenBySettings)
s.Require().Equal(plan.URLs[1].URL, statusUserLink)
s.Require().Equal(plan.URLs[1].IsStatusSharedURL, true)
s.Require().Equal(plan.URLs[1].Permission, URLUnfurlingAllowed)
s.Require().Equal(plan.URLs[2].URL, gifLink)
s.Require().Equal(plan.URLs[2].IsStatusSharedURL, false)
s.Require().Equal(plan.URLs[2].Permission, URLUnfurlingNotSupported)
}
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Limit() {
linksToUnfurl := "https://www.youtube.com/watch?v=6dkDepLX0rk " +
text := "https://www.youtube.com/watch?v=6dkDepLX0rk " +
"https://www.youtube.com/watch?v=ferZnZ0_rSM " +
"https://www.youtube.com/watch?v=bdneye4pzMw " +
"https://www.youtube.com/watch?v=pRERgcQe-fQ " +
@ -686,6 +683,6 @@ func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Limit() {
"https://www.youtube.com/watch?v=hxsJvKYyVyg " +
"https://www.youtube.com/watch?v=jIIuzB11dsA "
urls := GetURLs(linksToUnfurl)
urls := s.m.GetURLs(text)
s.Require().Equal(UnfurledLinksPerMessageLimit, len(urls))
}

View File

@ -1187,10 +1187,19 @@ func (api *PublicAPI) GetLinkPreviewData(link string) (previewData urls.LinkPrev
return urls.GetLinkPreviewData(link)
}
// GetTextURLsToUnfurl parses text and returns a deduplicated and (somewhat) normalized
// slice of URLs. The returned URLs can be used as cache keys by clients.
// For each URL there's a corresponding metadata which should be used as to plan the unfurling.
func (api *PublicAPI) GetTextURLsToUnfurl(text string) *protocol.URLsUnfurlPlan {
return api.service.messenger.GetTextURLsToUnfurl(text)
}
// Deprecated: GetTextURLs is deprecated in favor of more generic GetTextURLsToUnfurl.
//
// GetTextURLs parses text and returns a deduplicated and (somewhat) normalized
// slice of URLs. The returned URLs can be used as cache keys by clients.
func (api *PublicAPI) GetTextURLs(text string) []string {
return protocol.GetURLs(text)
return api.service.messenger.GetURLs(text)
}
// UnfurlURLs uses a best-effort approach to unfurl each URL. Failed URLs will