chore: Moved link preview unfurlers to a separate package (#3917)

* MessengerLinkPreviewsTestSuite
This commit is contained in:
Igor Sirotin 2023-08-21 22:31:32 +03:00 committed by GitHub
parent 084d4bac0c
commit 09a988607d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 641 additions and 549 deletions

View File

@ -1,469 +0,0 @@
package linkpreview
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
neturl "net/url"
"regexp"
"strings"
"time"
"github.com/keighl/metabolize"
"go.uber.org/zap"
"golang.org/x/net/publicsuffix"
"github.com/status-im/markdown"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/protobuf"
)
// LinkPreview embeds common.LinkPreview without adding any fields.
type LinkPreview struct {
common.LinkPreview
}
// Unfurler is implemented by every strategy that can produce a
// common.LinkPreview; newUnfurler picks the implementation for a given URL.
type Unfurler interface {
unfurl() (common.LinkPreview, error)
}
// Headers maps HTTP header names to the values that fetchBody sets on the
// outgoing request.
type Headers map[string]string
const (
// defaultRequestTimeout bounds every request made by fetchBody and is also
// the timeout of the client returned by NewDefaultHTTPClient.
defaultRequestTimeout = 15000 * time.Millisecond
// maxImageSize is the largest payload, in bytes, that compressImage will
// return (350 KiB).
maxImageSize = 1024 * 350
// headerAcceptJSON and headerAcceptText are the "accept" header values sent
// by the oEmbed and OpenGraph unfurlers respectively.
headerAcceptJSON = "application/json; charset=utf-8"
headerAcceptText = "text/html; charset=utf-8"
// Without a particular user agent, many providers treat status-go as a
// gluttony bot, and either respond more frequently with a 429 (Too Many
// Requests), or simply refuse to return valid data. Note that using a known
// browser UA doesn't work well with some providers, such as Spotify,
// apparently they still flag status-go as a bad actor.
headerUserAgent = "status-go/v0.151.15"
// Currently set to English, but we could make this setting dynamic according
// to the user's language of choice.
headerAcceptLanguage = "en-US,en;q=0.5"
)
// imageURLRegexp matches URL paths that end with one of the supported image
// file extensions, case-insensitively. The literal dot is required so that a
// path which merely ends in the letters "png", "jpg", "jpeg" or "webp" (for
// example "/docs/what-is-a-png") is not mistaken for an image URL.
var imageURLRegexp = regexp.MustCompile(`(?i)^.+\.(png|jpg|jpeg|webp)$`)
// fetchBody issues a GET request for url with the given headers and returns
// the full response body.
//
// The request is bound to a context that expires after defaultRequestTimeout.
// A response whose status code is 400 or above is reported as an error, and a
// failure to close the response body is only logged.
func fetchBody(logger *zap.Logger, httpClient http.Client, url string, headers Headers) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultRequestTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		// Nothing has been sent yet; only building the request failed.
		return nil, fmt.Errorf("failed to create HTTP request: %w", err)
	}
	// Ranging over a nil Headers map is a no-op, so callers may pass nil.
	for k, v := range headers {
		req.Header.Set(k, v)
	}

	res, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to perform HTTP request: %w", err)
	}
	defer func() {
		if err := res.Body.Close(); err != nil {
			logger.Error("failed to close response body", zap.Error(err))
		}
	}()

	if res.StatusCode >= http.StatusBadRequest {
		return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read body bytes: %w", err)
	}
	return bodyBytes, nil
}
// newDefaultLinkPreview returns a preview whose URL and Hostname are taken
// from url; every other field is left at its zero value.
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
	var preview common.LinkPreview
	preview.URL = url.String()
	preview.Hostname = url.Hostname()
	return preview
}
// fetchThumbnail downloads the image at url and describes it as a thumbnail:
// its width, its height and the payload encoded as a data URI.
func fetchThumbnail(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreviewThumbnail, error) {
	var result common.LinkPreviewThumbnail

	payload, err := fetchBody(logger, httpClient, url, nil)
	if err != nil {
		return result, fmt.Errorf("could not fetch thumbnail url='%s': %w", url, err)
	}

	w, h, err := images.GetImageDimensions(payload)
	if err != nil {
		return result, fmt.Errorf("could not get image dimensions url='%s': %w", url, err)
	}
	result.Width, result.Height = w, h

	encoded, err := images.GetPayloadDataURI(payload)
	if err != nil {
		return result, fmt.Errorf("could not build data URI url='%s': %w", url, err)
	}
	result.DataURI = encoded

	return result, nil
}
// OEmbedUnfurler builds a link preview by querying an oEmbed endpoint about
// the target URL.
type OEmbedUnfurler struct {
// logger and httpClient are handed to fetchBody when the endpoint is queried.
logger *zap.Logger
httpClient http.Client
// oembedEndpoint describes where the consumer may request representations for
// the supported URL scheme. For example, for YouTube, it is
// https://www.youtube.com/oembed.
oembedEndpoint string
// url is the actual URL to be unfurled.
url *neturl.URL
}
// OEmbedResponse holds the fields decoded from an oEmbed JSON response.
type OEmbedResponse struct {
// Title is required: OEmbedUnfurler.unfurl fails when it is empty.
Title string `json:"title"`
// ThumbnailURL is decoded from "thumbnail_url".
ThumbnailURL string `json:"thumbnail_url"`
}
// newOEmbedURL returns the oEmbed endpoint URL with its query string set to
// ask for a JSON representation of u.url.
func (u OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
	endpoint, err := neturl.Parse(u.oembedEndpoint)
	if err != nil {
		return nil, err
	}

	// When format is specified, the provider MUST return data in the requested
	// format, else return an error.
	query := neturl.Values{}
	query.Set("url", u.url.String())
	query.Set("format", "json")
	endpoint.RawQuery = query.Encode()

	return endpoint, nil
}
// unfurl queries the oEmbed endpoint for u.url and returns a LINK preview
// whose title comes from the response.
//
// It fails when the endpoint URL cannot be built, the request fails, the body
// is not valid JSON, or the response has no title. In every failure case the
// default preview (URL, hostname and type only) is returned with the error.
func (u OEmbedUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_LINK

	oembedURL, err := u.newOEmbedURL()
	if err != nil {
		return preview, err
	}

	headers := map[string]string{
		"accept":          headerAcceptJSON,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
	if err != nil {
		return preview, err
	}

	var oembedResponse OEmbedResponse
	if err := json.Unmarshal(oembedBytes, &oembedResponse); err != nil {
		return preview, fmt.Errorf("failed to unmarshal oEmbed response: %w", err)
	}

	if oembedResponse.Title == "" {
		return preview, fmt.Errorf("missing required title in oEmbed response")
	}

	preview.Title = oembedResponse.Title
	return preview, nil
}
// OpenGraphMetadata is the target that metabolize fills from a page's
// OpenGraph meta tags, named by the `meta` struct tags.
type OpenGraphMetadata struct {
Title string `json:"title" meta:"og:title"`
Description string `json:"description" meta:"og:description"`
ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}
// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
// gives back a JSON response with a "html" field that's supposed to be embedded
// in an iframe (hardly useful for existing Status' clients).
type OpenGraphUnfurler struct {
// url is the page whose OpenGraph tags are read.
url *neturl.URL
// logger and httpClient are handed to fetchBody and fetchThumbnail.
logger *zap.Logger
httpClient http.Client
}
// unfurl downloads the page at u.url, extracts its OpenGraph tags and returns
// a LINK preview with the title, the description and, when one can be
// fetched, a thumbnail.
//
// A missing og:title is an error. A thumbnail that cannot be fetched is only
// logged, and the preview is returned without it.
func (u OpenGraphUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_LINK

	headers := map[string]string{
		"accept":          headerAcceptText,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers)
	if err != nil {
		return preview, err
	}

	var ogMetadata OpenGraphMetadata
	err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata)
	if err != nil {
		// Keep the underlying cause so the caller's log says why parsing failed.
		return preview, fmt.Errorf("failed to parse OpenGraph data: %w", err)
	}

	// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
	// title tag, but article pages do. In the future, we can fallback to the
	// website's title by using the <title> tag.
	if ogMetadata.Title == "" {
		return preview, fmt.Errorf("missing required title in OpenGraph response")
	}

	if ogMetadata.ThumbnailURL != "" {
		t, err := fetchThumbnail(u.logger, u.httpClient, ogMetadata.ThumbnailURL)
		if err != nil {
			// Given we want to fetch thumbnails on a best-effort basis, if an error
			// happens we simply log it.
			u.logger.Info("failed to fetch thumbnail", zap.String("url", u.url.String()), zap.Error(err))
		} else {
			preview.Thumbnail = t
		}
	}

	preview.Title = ogMetadata.Title
	preview.Description = ogMetadata.Description
	return preview, nil
}
// ImageUnfurler builds a preview for a URL that points directly at an image.
type ImageUnfurler struct {
// url is the image URL to download.
url *neturl.URL
// logger and httpClient are handed to fetchBody.
logger *zap.Logger
httpClient http.Client
}
// compressImage decodes imgBytes, compresses the image and returns whichever
// of the original and compressed payloads is smaller. It fails when decoding
// or compressing fails, or when the chosen payload exceeds maxImageSize.
func compressImage(imgBytes []byte) ([]byte, error) {
	decoded, err := images.DecodeImageData(imgBytes, bytes.NewReader(imgBytes))
	if err != nil {
		return nil, err
	}

	out := bytes.NewBuffer([]byte{})
	if err = images.CompressToFileLimits(out, decoded, images.DefaultBounds); err != nil {
		return nil, err
	}

	// Keep the original payload unless compression actually made it smaller.
	best := imgBytes
	if candidate := out.Bytes(); len(candidate) < len(best) {
		best = candidate
	}

	if len(best) > maxImageSize {
		return nil, errors.New("image too large")
	}
	return best, nil
}
// unfurl downloads the image at u.url, checks that it is a supported type,
// compresses it and returns an IMAGE preview whose thumbnail carries the
// compressed image's dimensions and data URI.
func (u ImageUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_IMAGE

	rawURL := u.url.String()

	payload, err := fetchBody(u.logger, u.httpClient, rawURL, Headers{"user-agent": headerUserAgent})
	if err != nil {
		return preview, err
	}

	if !isSupportedImage(payload) {
		return preview, fmt.Errorf("unsupported image type url='%s'", rawURL)
	}

	small, err := compressImage(payload)
	if err != nil {
		return preview, fmt.Errorf("failed to compress image url='%s': %w", rawURL, err)
	}

	w, h, err := images.GetImageDimensions(small)
	if err != nil {
		return preview, fmt.Errorf("could not get image dimensions url='%s': %w", rawURL, err)
	}

	encoded, err := images.GetPayloadDataURI(small)
	if err != nil {
		return preview, fmt.Errorf("could not build data URI url='%s': %w", rawURL, err)
	}

	preview.Thumbnail.Width = w
	preview.Thumbnail.Height = h
	preview.Thumbnail.DataURI = encoded
	return preview, nil
}
// wwwPrefixRegexp captures everything after a leading "www." so it can be
// stripped. It is compiled once at package scope instead of on every call.
var wwwPrefixRegexp = regexp.MustCompile(`^www\.(.*)$`)

// normalizeHostname lowercases hostname and removes a single leading "www."
// label, so that "WWW.Reddit.com" and "reddit.com" compare equal.
func normalizeHostname(hostname string) string {
	return wwwPrefixRegexp.ReplaceAllString(strings.ToLower(hostname), "$1")
}
// isSupportedImageURL reports whether the path of url matches imageURLRegexp,
// i.e. ends with one of the supported image extensions. It is a cheap way to
// decide that a URL should be unfurled as an image before the response body
// has been downloaded.
func isSupportedImageURL(url *neturl.URL) bool {
	path := url.Path
	return imageURLRegexp.MatchString(path)
}
// isSupportedImage reports whether payload is a JPEG, PNG or WebP image. In
// the future, we should differentiate between animated and non-animated WebP
// because, currently, only static WebP can be processed by functions in the
// status-go/images package.
func isSupportedImage(payload []byte) bool {
	switch {
	case images.IsJpeg(payload), images.IsPng(payload), images.IsWebp(payload):
		return true
	}
	return false
}
// newUnfurler picks the unfurling strategy for url: image URLs get an
// ImageUnfurler, reddit.com gets an OEmbedUnfurler, and everything else falls
// back to an OpenGraphUnfurler.
func newUnfurler(logger *zap.Logger, httpClient http.Client, url *neturl.URL) Unfurler {
	if isSupportedImageURL(url) {
		return ImageUnfurler{url: url, logger: logger, httpClient: httpClient}
	}

	if normalizeHostname(url.Hostname()) == "reddit.com" {
		return OEmbedUnfurler{
			oembedEndpoint: "https://www.reddit.com/oembed",
			url:            url,
			logger:         logger,
			httpClient:     httpClient,
		}
	}

	return OpenGraphUnfurler{url: url, logger: logger, httpClient: httpClient}
}
// unfurl parses url, runs the matching unfurler and, on success, overwrites
// the preview's hostname with the lowercased hostname of the parsed URL.
func unfurl(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreview, error) {
	parsedURL, err := neturl.Parse(url)
	if err != nil {
		return common.LinkPreview{}, err
	}

	preview, err := newUnfurler(logger, httpClient, parsedURL).unfurl()
	if err != nil {
		return preview, err
	}

	preview.Hostname = strings.ToLower(parsedURL.Hostname())
	return preview, nil
}
// parseValidURL parses rawURL like url.Parse but additionally rejects URLs
// that have no scheme or whose hostname has no known public suffix, so that
// only URLs worth requesting a link preview for are accepted.
func parseValidURL(rawURL string) (*neturl.URL, error) {
	parsed, parseErr := neturl.Parse(rawURL)
	switch {
	case parseErr != nil:
		return nil, fmt.Errorf("parsing URL failed: %w", parseErr)
	case parsed.Scheme == "":
		return nil, errors.New("missing URL scheme")
	}

	if _, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname()); err != nil {
		return nil, fmt.Errorf("missing known URL domain: %w", err)
	}
	return parsed, nil
}
// GetURLs returns only what we consider unfurleable URLs.
//
// If we wanted to be extra precise and help improve UX, we could ignore URLs
// that we know can't be unfurled. This is at least possible with the oEmbed
// protocol because providers must specify an endpoint scheme.
func GetURLs(text string) []string {
parsedText := markdown.Parse([]byte(text), nil)
visitor := common.RunLinksVisitor(parsedText)
urls := make([]string, 0, len(visitor.Links))
indexed := make(map[string]any, len(visitor.Links))
for _, rawURL := range visitor.Links {
parsedURL, err := parseValidURL(rawURL)
if err != nil {
continue
}
// Lowercase the host so the URL can be used as a cache key. Particularly on
// mobile clients it is common that the first character in a text input is
// automatically uppercased. In WhatsApp they incorrectly lowercase the
// URL's path, but this is incorrect. For instance, some URL shorteners are
// case-sensitive, some websites encode base64 in the path, etc.
parsedURL.Host = strings.ToLower(parsedURL.Host)
idx := parsedURL.String()
// Removes the spurious trailing forward slash.
idx = strings.TrimRight(idx, "/")
if _, exists := indexed[idx]; exists {
continue
} else {
indexed[idx] = nil
urls = append(urls, idx)
}
}
return urls
}
// NewDefaultHTTPClient returns an http.Client whose Timeout is
// defaultRequestTimeout.
func NewDefaultHTTPClient() http.Client {
	var client http.Client
	client.Timeout = defaultRequestTimeout
	return client
}
// UnfurlURLs unfurls each URL in turn and returns the previews that could be
// built; URLs that fail are logged and skipped. It assumes clients pass URLs
// verbatim that were validated and processed by GetURLs. A nil logger is
// replaced by a zap development logger.
func UnfurlURLs(logger *zap.Logger, httpClient http.Client, urls []string) ([]common.LinkPreview, error) {
	if logger == nil {
		devLogger, err := zap.NewDevelopment()
		if err != nil {
			return nil, fmt.Errorf("failed to create logger: %w", err)
		}
		logger = devLogger
	}

	previews := make([]common.LinkPreview, 0, len(urls))
	for i := range urls {
		target := urls[i]
		logger.Debug("unfurling", zap.String("url", target))

		preview, err := unfurl(logger, httpClient, target)
		if err == nil {
			previews = append(previews, preview)
			continue
		}
		logger.Info("failed to unfurl", zap.String("url", target), zap.Error(err))
	}

	return previews, nil
}

View File

@ -0,0 +1,113 @@
package unfurlers
import (
"bytes"
"errors"
"fmt"
"net/http"
neturl "net/url"
"regexp"
"go.uber.org/zap"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/protobuf"
)
const (
// maxImageSize is the largest payload, in bytes, that compressImage will
// return (350 KiB); anything bigger is rejected as "image too large".
maxImageSize = 1024 * 350
)
// imageURLRegexp matches URL paths that end with one of the supported image
// file extensions, case-insensitively. The literal dot is required so that a
// path which merely ends in the letters "png", "jpg", "jpeg" or "webp" (for
// example "/docs/what-is-a-png") is not mistaken for an image URL.
var imageURLRegexp = regexp.MustCompile(`(?i)^.+\.(png|jpg|jpeg|webp)$`)
// ImageUnfurler builds a preview for a URL that points directly at an image.
// Use NewImageUnfurler to create one.
type ImageUnfurler struct {
// url is the image URL to download.
url *neturl.URL
// logger and httpClient are handed to fetchBody.
logger *zap.Logger
httpClient *http.Client
}
// NewImageUnfurler returns an ImageUnfurler for URL that logs through logger
// and performs its requests with httpClient.
func NewImageUnfurler(URL *neturl.URL, logger *zap.Logger, httpClient *http.Client) *ImageUnfurler {
	unfurler := new(ImageUnfurler)
	unfurler.url = URL
	unfurler.logger = logger
	unfurler.httpClient = httpClient
	return unfurler
}
// compressImage decodes imgBytes, compresses the image and returns whichever
// of the original and compressed payloads is smaller. It fails when decoding
// or compressing fails, or when the chosen payload exceeds maxImageSize.
func compressImage(imgBytes []byte) ([]byte, error) {
	decoded, err := images.DecodeImageData(imgBytes, bytes.NewReader(imgBytes))
	if err != nil {
		return nil, err
	}

	out := bytes.NewBuffer([]byte{})
	if err = images.CompressToFileLimits(out, decoded, images.DefaultBounds); err != nil {
		return nil, err
	}

	// Keep the original payload unless compression actually made it smaller.
	best := imgBytes
	if candidate := out.Bytes(); len(candidate) < len(best) {
		best = candidate
	}

	if len(best) > maxImageSize {
		return nil, errors.New("image too large")
	}
	return best, nil
}
// IsSupportedImageURL reports whether the path of url matches imageURLRegexp,
// i.e. ends with one of the supported image extensions. It is a cheap way to
// decide that a URL should be unfurled as an image before the response body
// has been downloaded.
func IsSupportedImageURL(url *neturl.URL) bool {
	path := url.Path
	return imageURLRegexp.MatchString(path)
}
// isSupportedImage reports whether payload is a JPEG, PNG or WebP image. In
// the future, we should differentiate between animated and non-animated WebP
// because, currently, only static WebP can be processed by functions in the
// status-go/images package.
func isSupportedImage(payload []byte) bool {
	switch {
	case images.IsJpeg(payload), images.IsPng(payload), images.IsWebp(payload):
		return true
	}
	return false
}
// Unfurl downloads the image at u.url, checks that it is a supported type,
// compresses it and returns an IMAGE preview whose thumbnail carries the
// compressed image's dimensions and data URI.
func (u *ImageUnfurler) Unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_IMAGE

	rawURL := u.url.String()

	payload, err := fetchBody(u.logger, u.httpClient, rawURL, Headers{"user-agent": headerUserAgent})
	if err != nil {
		return preview, err
	}

	if !isSupportedImage(payload) {
		return preview, fmt.Errorf("unsupported image type url='%s'", rawURL)
	}

	small, err := compressImage(payload)
	if err != nil {
		return preview, fmt.Errorf("failed to compress image url='%s': %w", rawURL, err)
	}

	w, h, err := images.GetImageDimensions(small)
	if err != nil {
		return preview, fmt.Errorf("could not get image dimensions url='%s': %w", rawURL, err)
	}

	encoded, err := images.GetPayloadDataURI(small)
	if err != nil {
		return preview, fmt.Errorf("could not build data URI url='%s': %w", rawURL, err)
	}

	preview.Thumbnail.Width = w
	preview.Thumbnail.Height = h
	preview.Thumbnail.DataURI = encoded
	return preview, nil
}

View File

@ -0,0 +1,93 @@
package unfurlers
import (
"encoding/json"
"fmt"
"net/http"
neturl "net/url"
"go.uber.org/zap"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/protobuf"
)
// OEmbedUnfurler builds a link preview by querying an oEmbed endpoint about
// the target URL. Use NewOEmbedUnfurler to create one.
type OEmbedUnfurler struct {
// logger and httpClient are handed to fetchBody when the endpoint is queried.
logger *zap.Logger
httpClient *http.Client
// oembedEndpoint describes where the consumer may request representations for
// the supported URL scheme. For example, for YouTube, it is
// https://www.youtube.com/oembed.
oembedEndpoint string
// url is the actual URL to be unfurled.
url *neturl.URL
}
// NewOEmbedUnfurler returns an OEmbedUnfurler that asks oembedEndpoint about
// url, logging through logger and performing requests with httpClient.
func NewOEmbedUnfurler(oembedEndpoint string, url *neturl.URL, logger *zap.Logger, httpClient *http.Client) *OEmbedUnfurler {
	unfurler := new(OEmbedUnfurler)
	unfurler.oembedEndpoint = oembedEndpoint
	unfurler.url = url
	unfurler.logger = logger
	unfurler.httpClient = httpClient
	return unfurler
}
// OEmbedResponse holds the fields decoded from an oEmbed JSON response.
type OEmbedResponse struct {
// Title is required: OEmbedUnfurler.Unfurl fails when it is empty.
Title string `json:"title"`
// ThumbnailURL is decoded from "thumbnail_url".
ThumbnailURL string `json:"thumbnail_url"`
}
// newOEmbedURL returns the oEmbed endpoint URL with its query string set to
// ask for a JSON representation of u.url.
func (u *OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
	endpoint, err := neturl.Parse(u.oembedEndpoint)
	if err != nil {
		return nil, err
	}

	// When format is specified, the provider MUST return data in the requested
	// format, else return an error.
	query := neturl.Values{}
	query.Set("url", u.url.String())
	query.Set("format", "json")
	endpoint.RawQuery = query.Encode()

	return endpoint, nil
}
// Unfurl queries the oEmbed endpoint for u.url and returns a LINK preview
// whose title comes from the response.
//
// It fails when the endpoint URL cannot be built, the request fails, the body
// is not valid JSON, or the response has no title. In every failure case the
// default preview (URL, hostname and type only) is returned with the error.
//
// The receiver is a pointer, matching newOEmbedURL and the pointer returned by
// NewOEmbedUnfurler.
func (u *OEmbedUnfurler) Unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_LINK

	oembedURL, err := u.newOEmbedURL()
	if err != nil {
		return preview, err
	}

	headers := map[string]string{
		"accept":          headerAcceptJSON,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
	if err != nil {
		return preview, err
	}

	var oembedResponse OEmbedResponse
	if err := json.Unmarshal(oembedBytes, &oembedResponse); err != nil {
		return preview, fmt.Errorf("failed to unmarshal oEmbed response: %w", err)
	}

	if oembedResponse.Title == "" {
		return preview, fmt.Errorf("missing required title in oEmbed response")
	}

	preview.Title = oembedResponse.Title
	return preview, nil
}

View File

@ -0,0 +1,106 @@
package unfurlers
import (
"bytes"
"fmt"
"io/ioutil"
"net/http"
neturl "net/url"
"github.com/keighl/metabolize"
"go.uber.org/zap"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/protobuf"
)
// OpenGraphMetadata is the target that metabolize fills from a page's
// OpenGraph meta tags, named by the `meta` struct tags.
type OpenGraphMetadata struct {
Title string `json:"title" meta:"og:title"`
Description string `json:"description" meta:"og:description"`
ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}
// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
// gives back a JSON response with a "html" field that's supposed to be embedded
// in an iframe (hardly useful for existing Status' clients).
type OpenGraphUnfurler struct {
// url is the page whose OpenGraph tags are read.
url *neturl.URL
// logger and httpClient are handed to fetchBody and fetchThumbnail.
logger *zap.Logger
httpClient *http.Client
}
// NewOpenGraphUnfurler returns an OpenGraphUnfurler for URL that logs through
// logger and performs its requests with httpClient.
func NewOpenGraphUnfurler(URL *neturl.URL, logger *zap.Logger, httpClient *http.Client) *OpenGraphUnfurler {
	unfurler := new(OpenGraphUnfurler)
	unfurler.url = URL
	unfurler.logger = logger
	unfurler.httpClient = httpClient
	return unfurler
}
// Unfurl downloads the page at u.url, extracts its OpenGraph tags and returns
// a LINK preview with the title, the description and, when one can be
// fetched, a thumbnail.
//
// A missing og:title is an error. A thumbnail that cannot be fetched is only
// logged, and the preview is returned without it.
func (u *OpenGraphUnfurler) Unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)
	preview.Type = protobuf.UnfurledLink_LINK

	headers := map[string]string{
		"accept":          headerAcceptText,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers)
	if err != nil {
		return preview, err
	}

	var ogMetadata OpenGraphMetadata
	err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata)
	if err != nil {
		// Keep the underlying cause so the caller's log says why parsing failed.
		return preview, fmt.Errorf("failed to parse OpenGraph data: %w", err)
	}

	// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
	// title tag, but article pages do. In the future, we can fallback to the
	// website's title by using the <title> tag.
	if ogMetadata.Title == "" {
		return preview, fmt.Errorf("missing required title in OpenGraph response")
	}

	if ogMetadata.ThumbnailURL != "" {
		t, err := fetchThumbnail(u.logger, u.httpClient, ogMetadata.ThumbnailURL)
		if err != nil {
			// Given we want to fetch thumbnails on a best-effort basis, if an error
			// happens we simply log it.
			u.logger.Info("failed to fetch thumbnail", zap.String("url", u.url.String()), zap.Error(err))
		} else {
			preview.Thumbnail = t
		}
	}

	preview.Title = ogMetadata.Title
	preview.Description = ogMetadata.Description
	return preview, nil
}
// fetchThumbnail downloads the image at url and describes it as a thumbnail:
// its width, its height and the payload encoded as a data URI.
func fetchThumbnail(logger *zap.Logger, httpClient *http.Client, url string) (common.LinkPreviewThumbnail, error) {
	var result common.LinkPreviewThumbnail

	payload, err := fetchBody(logger, httpClient, url, nil)
	if err != nil {
		return result, fmt.Errorf("could not fetch thumbnail url='%s': %w", url, err)
	}

	w, h, err := images.GetImageDimensions(payload)
	if err != nil {
		return result, fmt.Errorf("could not get image dimensions url='%s': %w", url, err)
	}
	result.Width, result.Height = w, h

	encoded, err := images.GetPayloadDataURI(payload)
	if err != nil {
		return result, fmt.Errorf("could not build data URI url='%s': %w", url, err)
	}
	result.DataURI = encoded

	return result, nil
}

View File

@ -0,0 +1,80 @@
package unfurlers
import (
"context"
"fmt"
"io/ioutil"
"net/http"
neturl "net/url"
"time"
"go.uber.org/zap"
"github.com/status-im/status-go/protocol/common"
)
const (
// DefaultRequestTimeout bounds every request made by fetchBody. It is
// exported so callers can build an HTTP client with the same timeout.
DefaultRequestTimeout = 15000 * time.Millisecond
// headerAcceptJSON and headerAcceptText are the "accept" header values sent
// by the oEmbed and OpenGraph unfurlers respectively.
headerAcceptJSON = "application/json; charset=utf-8"
headerAcceptText = "text/html; charset=utf-8"
// Without a particular user agent, many providers treat status-go as a
// gluttony bot, and either respond more frequently with a 429 (Too Many
// Requests), or simply refuse to return valid data. Note that using a known
// browser UA doesn't work well with some providers, such as Spotify,
// apparently they still flag status-go as a bad actor.
headerUserAgent = "status-go/v0.151.15"
// Currently set to English, but we could make this setting dynamic according
// to the user's language of choice.
headerAcceptLanguage = "en-US,en;q=0.5"
)
// Headers maps HTTP header names to the values that fetchBody sets on the
// outgoing request.
type Headers map[string]string
// Unfurler is implemented by every strategy in this package that can produce
// a common.LinkPreview.
type Unfurler interface {
Unfurl() (common.LinkPreview, error)
}
// newDefaultLinkPreview returns a preview whose URL and Hostname are taken
// from url; every other field is left at its zero value.
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
	var preview common.LinkPreview
	preview.URL = url.String()
	preview.Hostname = url.Hostname()
	return preview
}
// fetchBody issues a GET request for url with the given headers and returns
// the full response body.
//
// The request is bound to a context that expires after DefaultRequestTimeout.
// A response whose status code is 400 or above is reported as an error, and a
// failure to close the response body is only logged.
func fetchBody(logger *zap.Logger, httpClient *http.Client, url string, headers Headers) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), DefaultRequestTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		// Nothing has been sent yet; only building the request failed.
		return nil, fmt.Errorf("failed to create HTTP request: %w", err)
	}
	// Ranging over a nil Headers map is a no-op, so callers may pass nil.
	for k, v := range headers {
		req.Header.Set(k, v)
	}

	res, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to perform HTTP request: %w", err)
	}
	defer func() {
		if err := res.Body.Close(); err != nil {
			logger.Error("failed to close response body", zap.Error(err))
		}
	}()

	if res.StatusCode >= http.StatusBadRequest {
		return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read body bytes: %w", err)
	}
	return bodyBytes, nil
}

View File

@ -49,7 +49,6 @@ import (
"github.com/status-im/status-go/protocol/identity"
"github.com/status-im/status-go/protocol/identity/alias"
"github.com/status-im/status-go/protocol/identity/identicon"
"github.com/status-im/status-go/protocol/linkpreview"
"github.com/status-im/status-go/protocol/protobuf"
"github.com/status-im/status-go/protocol/pushnotificationclient"
"github.com/status-im/status-go/protocol/pushnotificationserver"
@ -6139,10 +6138,6 @@ func generateAliasAndIdenticon(pk string) (string, string, error) {
}
func (m *Messenger) UnfurlURLs(urls []string) ([]common.LinkPreview, error) {
return linkpreview.UnfurlURLs(m.logger, linkpreview.NewDefaultHTTPClient(), urls)
}
func (m *Messenger) SendEmojiReaction(ctx context.Context, chatID, messageID string, emojiID protobuf.EmojiReaction_Type) (*MessengerResponse, error) {
var response MessengerResponse

View File

@ -0,0 +1,152 @@
package protocol
import (
"errors"
"fmt"
"net/http"
neturl "net/url"
"regexp"
"strings"
"github.com/status-im/markdown"
"go.uber.org/zap"
"golang.org/x/net/publicsuffix"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/linkpreview/unfurlers"
)
// LinkPreview embeds common.LinkPreview without adding any fields.
type LinkPreview struct {
common.LinkPreview
}
// hostnameWWWPrefixRegexp captures everything after a leading "www." so it can
// be stripped. It is compiled once at package scope instead of on every call.
var hostnameWWWPrefixRegexp = regexp.MustCompile(`^www\.(.*)$`)

// normalizeHostname lowercases hostname and removes a single leading "www."
// label, so that "WWW.Reddit.com" and "reddit.com" compare equal.
func normalizeHostname(hostname string) string {
	return hostnameWWWPrefixRegexp.ReplaceAllString(strings.ToLower(hostname), "$1")
}
// newURLUnfurler picks the unfurling strategy for url: image URLs get an
// image unfurler, reddit.com gets an oEmbed unfurler, and everything else
// falls back to an OpenGraph unfurler. All of them log through m.logger.
func (m *Messenger) newURLUnfurler(httpClient *http.Client, url *neturl.URL) unfurlers.Unfurler {
	if unfurlers.IsSupportedImageURL(url) {
		return unfurlers.NewImageUnfurler(url, m.logger, httpClient)
	}

	if normalizeHostname(url.Hostname()) == "reddit.com" {
		return unfurlers.NewOEmbedUnfurler("https://www.reddit.com/oembed", url, m.logger, httpClient)
	}

	return unfurlers.NewOpenGraphUnfurler(url, m.logger, httpClient)
}
// unfurlURL parses url, runs the matching unfurler and, on success, overwrites
// the preview's hostname with the lowercased hostname of the parsed URL.
func (m *Messenger) unfurlURL(httpClient *http.Client, url string) (common.LinkPreview, error) {
	parsedURL, err := neturl.Parse(url)
	if err != nil {
		return common.LinkPreview{}, err
	}

	preview, err := m.newURLUnfurler(httpClient, parsedURL).Unfurl()
	if err != nil {
		return preview, err
	}

	preview.Hostname = strings.ToLower(parsedURL.Hostname())
	return preview, nil
}
// parseValidURL parses rawURL like url.Parse but additionally rejects URLs
// that have no scheme or whose hostname has no known public suffix, so that
// only URLs worth requesting a link preview for are accepted.
func parseValidURL(rawURL string) (*neturl.URL, error) {
	parsed, parseErr := neturl.Parse(rawURL)
	switch {
	case parseErr != nil:
		return nil, fmt.Errorf("parsing URL failed: %w", parseErr)
	case parsed.Scheme == "":
		return nil, errors.New("missing URL scheme")
	}

	if _, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname()); err != nil {
		return nil, fmt.Errorf("missing known URL domain: %w", err)
	}
	return parsed, nil
}
// GetURLs returns only what we consider unfurleable URLs.
//
// If we wanted to be extra precise and help improve UX, we could ignore URLs
// that we know can't be unfurled. This is at least possible with the oEmbed
// protocol because providers must specify an endpoint scheme.
func GetURLs(text string) []string {
parsedText := markdown.Parse([]byte(text), nil)
visitor := common.RunLinksVisitor(parsedText)
urls := make([]string, 0, len(visitor.Links))
indexed := make(map[string]any, len(visitor.Links))
for _, rawURL := range visitor.Links {
parsedURL, err := parseValidURL(rawURL)
if err != nil {
continue
}
// Lowercase the host so the URL can be used as a cache key. Particularly on
// mobile clients it is common that the first character in a text input is
// automatically uppercased. In WhatsApp they incorrectly lowercase the
// URL's path, but this is incorrect. For instance, some URL shorteners are
// case-sensitive, some websites encode base64 in the path, etc.
parsedURL.Host = strings.ToLower(parsedURL.Host)
idx := parsedURL.String()
// Removes the spurious trailing forward slash.
idx = strings.TrimRight(idx, "/")
if _, exists := indexed[idx]; exists {
continue
} else {
indexed[idx] = nil
urls = append(urls, idx)
}
}
return urls
}
// NewDefaultHTTPClient returns a new http.Client whose Timeout is
// unfurlers.DefaultRequestTimeout.
func NewDefaultHTTPClient() *http.Client {
	client := new(http.Client)
	client.Timeout = unfurlers.DefaultRequestTimeout
	return client
}
// UnfurlURLs unfurls each URL in turn and returns the previews that could be
// built; URLs that fail are logged and skipped, so the returned error is
// always nil. It assumes clients pass URLs verbatim that were validated and
// processed by GetURLs. A nil httpClient is replaced by NewDefaultHTTPClient().
func (m *Messenger) UnfurlURLs(httpClient *http.Client, urls []string) ([]common.LinkPreview, error) {
	client := httpClient
	if client == nil {
		client = NewDefaultHTTPClient()
	}

	previews := make([]common.LinkPreview, 0, len(urls))
	for i := range urls {
		target := urls[i]
		m.logger.Debug("unfurling", zap.String("url", target))

		preview, err := m.unfurlURL(client, target)
		if err == nil {
			previews = append(previews, preview)
			continue
		}
		m.logger.Info("failed to unfurl", zap.String("url", target), zap.Error(err))
	}

	return previews, nil
}

View File

@ -1,4 +1,4 @@
package linkpreview
package protocol
import (
"bytes"
@ -11,12 +11,36 @@ import (
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
"github.com/status-im/status-go/protocol/common"
"github.com/status-im/status-go/protocol/linkpreview/unfurlers"
"github.com/status-im/status-go/protocol/protobuf"
)
func TestMessengerLinkPreviews(t *testing.T) {
suite.Run(t, new(MessengerLinkPreviewsTestSuite))
}
type MessengerLinkPreviewsTestSuite struct {
MessengerBaseTestSuite
}
//func (s *MessengerLinkPreviewsTestSuite) SetupTest() {
// s.logger = tt.MustCreateTestLogger()
//
// c := waku.DefaultConfig
// c.MinimumAcceptedPoW = 0
// shh := waku.New(&c, s.logger)
// s.shh = gethbridge.NewGethWakuWrapper(shh)
// s.Require().NoError(shh.Start())
//
// s.m = s.newMessenger()
// s.privateKey = s.m.identity
// _, err := s.m.Start()
// s.Require().NoError(err)
//}
// StubMatcher should either return an http.Response or nil in case the request
// doesn't match.
type StubMatcher func(req *http.Request) *http.Response
@ -92,7 +116,7 @@ func (t *StubTransport) AddURLMatcher(urlRegexp string, responseBody []byte, hea
// correctly prints the cause of the failure. The default behavior of
// require.Contains with long strings is to not print the formatted message
// (varargs to require.Contains).
func assertContainsLongString(t *testing.T, expected string, actual string, maxLength int) {
func (s *MessengerLinkPreviewsTestSuite) assertContainsLongString(expected string, actual string, maxLength int) {
var safeIdx float64
var actualShort string
var expectedShort string
@ -107,8 +131,7 @@ func assertContainsLongString(t *testing.T, expected string, actual string, maxL
expectedShort = expected[:int(safeIdx)]
}
require.Contains(
t,
s.Require().Contains(
actual, expected,
"'%s' should contain '%s'",
actualShort,
@ -116,7 +139,7 @@ func assertContainsLongString(t *testing.T, expected string, actual string, maxL
)
}
func Test_GetLinks(t *testing.T) {
func (s *MessengerLinkPreviewsTestSuite) Test_GetLinks() {
examples := []struct {
args string
expected []string
@ -163,22 +186,22 @@ func Test_GetLinks(t *testing.T) {
for _, ex := range examples {
links := GetURLs(ex.args)
require.Equal(t, ex.expected, links, "Failed for args: '%s'", ex.args)
s.Require().Equal(ex.expected, links, "Failed for args: '%s'", ex.args)
}
}
func readAsset(t *testing.T, filename string) []byte {
b, err := ioutil.ReadFile("../../_assets/tests/" + filename)
require.NoError(t, err)
func (s *MessengerLinkPreviewsTestSuite) readAsset(filename string) []byte {
b, err := ioutil.ReadFile("../_assets/tests/" + filename)
s.Require().NoError(err)
return b
}
func Test_UnfurlURLs_YouTube(t *testing.T) {
url := "https://www.youtube.com/watch?v=lE4UXdJSJM4"
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_YouTube() {
u := "https://www.youtube.com/watch?v=lE4UXdJSJM4"
thumbnailURL := "https://i.ytimg.com/vi/lE4UXdJSJM4/maxresdefault.jpg"
expected := common.LinkPreview{
Type: protobuf.UnfurledLink_LINK,
URL: url,
URL: u,
Hostname: "www.youtube.com",
Title: "Interview with a GNU/Linux user - Partition 1",
Description: "GNU/Linux Operating SystemInterview with a GNU/Linux user with Richie Guix - aired on © The GNU Linux.Programmer humorLinux humorProgramming jokesProgramming...",
@ -191,7 +214,7 @@ func Test_UnfurlURLs_YouTube(t *testing.T) {
transport := StubTransport{}
transport.AddURLMatcher(
url,
u,
[]byte(fmt.Sprintf(`
<html>
<head>
@ -203,30 +226,30 @@ func Test_UnfurlURLs_YouTube(t *testing.T) {
`, expected.Title, expected.Description, thumbnailURL)),
nil,
)
transport.AddURLMatcher(thumbnailURL, readAsset(t, "1.jpg"), nil)
transport.AddURLMatcher(thumbnailURL, s.readAsset("1.jpg"), nil)
stubbedClient := http.Client{Transport: &transport}
previews, err := UnfurlURLs(nil, stubbedClient, []string{url})
require.NoError(t, err)
require.Len(t, previews, 1)
previews, err := s.m.UnfurlURLs(&stubbedClient, []string{u})
s.Require().NoError(err)
s.Require().Len(previews, 1)
preview := previews[0]
require.Equal(t, expected.Type, preview.Type)
require.Equal(t, expected.URL, preview.URL)
require.Equal(t, expected.Hostname, preview.Hostname)
require.Equal(t, expected.Title, preview.Title)
require.Equal(t, expected.Description, preview.Description)
require.Equal(t, expected.Thumbnail.Width, preview.Thumbnail.Width)
require.Equal(t, expected.Thumbnail.Height, preview.Thumbnail.Height)
require.Equal(t, expected.Thumbnail.URL, preview.Thumbnail.URL)
assertContainsLongString(t, expected.Thumbnail.DataURI, preview.Thumbnail.DataURI, 100)
s.Require().Equal(expected.Type, preview.Type)
s.Require().Equal(expected.URL, preview.URL)
s.Require().Equal(expected.Hostname, preview.Hostname)
s.Require().Equal(expected.Title, preview.Title)
s.Require().Equal(expected.Description, preview.Description)
s.Require().Equal(expected.Thumbnail.Width, preview.Thumbnail.Width)
s.Require().Equal(expected.Thumbnail.Height, preview.Thumbnail.Height)
s.Require().Equal(expected.Thumbnail.URL, preview.Thumbnail.URL)
s.assertContainsLongString(expected.Thumbnail.DataURI, preview.Thumbnail.DataURI, 100)
}
func Test_UnfurlURLs_Reddit(t *testing.T) {
url := "https://www.reddit.com/r/Bitcoin/comments/13j0tzr/the_best_bitcoin_explanation_of_all_times/?utm_source=share"
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Reddit() {
u := "https://www.reddit.com/r/Bitcoin/comments/13j0tzr/the_best_bitcoin_explanation_of_all_times/?utm_source=share"
expected := common.LinkPreview{
Type: protobuf.UnfurledLink_LINK,
URL: url,
URL: u,
Hostname: "www.reddit.com",
Title: "The best bitcoin explanation of all times.",
Description: "",
@ -250,27 +273,27 @@ func Test_UnfurlURLs_Reddit(t *testing.T) {
)
stubbedClient := http.Client{Transport: &transport}
previews, err := UnfurlURLs(nil, stubbedClient, []string{url})
require.NoError(t, err)
require.Len(t, previews, 1)
previews, err := s.m.UnfurlURLs(&stubbedClient, []string{u})
s.Require().NoError(err)
s.Require().Len(previews, 1)
preview := previews[0]
require.Equal(t, expected.Type, preview.Type)
require.Equal(t, expected.URL, preview.URL)
require.Equal(t, expected.Hostname, preview.Hostname)
require.Equal(t, expected.Title, preview.Title)
require.Equal(t, expected.Description, preview.Description)
require.Equal(t, expected.Thumbnail, preview.Thumbnail)
s.Require().Equal(expected.Type, preview.Type)
s.Require().Equal(expected.URL, preview.URL)
s.Require().Equal(expected.Hostname, preview.Hostname)
s.Require().Equal(expected.Title, preview.Title)
s.Require().Equal(expected.Description, preview.Description)
s.Require().Equal(expected.Thumbnail, preview.Thumbnail)
}
// Test_UnfurlURLs_Timeout verifies the best-effort contract of UnfurlURLs when
// the HTTP request cannot complete in time: the call must return no error and
// an empty list of previews.
//
// NOTE(review): this span is diff output, so the pre-change and post-change
// form of each modified line appear one after the other.
func Test_UnfurlURLs_Timeout(t *testing.T) {
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Timeout() {
// A one-nanosecond client timeout is used, presumably so that the request to
// status.im always times out before any response can arrive.
httpClient := http.Client{Timeout: time.Nanosecond}
previews, err := UnfurlURLs(nil, httpClient, []string{"https://status.im"})
require.NoError(t, err)
require.Empty(t, previews)
previews, err := s.m.UnfurlURLs(&httpClient, []string{"https://status.im"})
// The failed fetch must not surface as an error; the URL is simply dropped.
s.Require().NoError(err)
s.Require().Empty(previews)
}
func Test_UnfurlURLs_CommonFailures(t *testing.T) {
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_CommonFailures() {
httpClient := http.Client{}
// Test URL that doesn't return any OpenGraph title.
@ -281,22 +304,22 @@ func Test_UnfurlURLs_CommonFailures(t *testing.T) {
nil,
)
stubbedClient := http.Client{Transport: &transport}
previews, err := UnfurlURLs(nil, stubbedClient, []string{"https://wikipedia.org"})
require.NoError(t, err)
require.Empty(t, previews)
previews, err := s.m.UnfurlURLs(&stubbedClient, []string{"https://wikipedia.org"})
s.Require().NoError(err)
s.Require().Empty(previews)
// Test 404.
previews, err = UnfurlURLs(nil, httpClient, []string{"https://github.com/status-im/i_do_not_exist"})
require.NoError(t, err)
require.Empty(t, previews)
previews, err = s.m.UnfurlURLs(&httpClient, []string{"https://github.com/status-im/i_do_not_exist"})
s.Require().NoError(err)
s.Require().Empty(previews)
// Test no response when trying to get OpenGraph metadata.
previews, err = UnfurlURLs(nil, httpClient, []string{"https://wikipedia.o"})
require.NoError(t, err)
require.Empty(t, previews)
previews, err = s.m.UnfurlURLs(&httpClient, []string{"https://wikipedia.o"})
s.Require().NoError(err)
s.Require().Empty(previews)
}
func Test_isSupportedImageURL(t *testing.T) {
func (s *MessengerLinkPreviewsTestSuite) Test_isSupportedImageURL() {
examples := []struct {
url string
expected bool
@ -315,16 +338,16 @@ func Test_isSupportedImageURL(t *testing.T) {
for _, e := range examples {
parsedURL, err := url.Parse(e.url)
require.NoError(t, err, e)
require.Equal(t, e.expected, isSupportedImageURL(parsedURL), e.url)
s.Require().NoError(err, e)
s.Require().Equal(e.expected, unfurlers.IsSupportedImageURL(parsedURL), e.url)
}
}
func Test_UnfurlURLs_Image(t *testing.T) {
url := "https://placehold.co/600x400@3x.png"
func (s *MessengerLinkPreviewsTestSuite) Test_UnfurlURLs_Image() {
u := "https://placehold.co/600x400@3x.png"
expected := common.LinkPreview{
Type: protobuf.UnfurledLink_IMAGE,
URL: url,
URL: u,
Hostname: "placehold.co",
Title: "",
Description: "",
@ -337,21 +360,21 @@ func Test_UnfurlURLs_Image(t *testing.T) {
transport := StubTransport{}
// Use a larger image to verify Thumbnail.DataURI is compressed.
transport.AddURLMatcher(url, readAsset(t, "IMG_1205.HEIC.jpg"), nil)
transport.AddURLMatcher(u, s.readAsset("IMG_1205.HEIC.jpg"), nil)
stubbedClient := http.Client{Transport: &transport}
previews, err := UnfurlURLs(nil, stubbedClient, []string{url})
require.NoError(t, err)
require.Len(t, previews, 1)
previews, err := s.m.UnfurlURLs(&stubbedClient, []string{u})
s.Require().NoError(err)
s.Require().Len(previews, 1)
preview := previews[0]
require.Equal(t, expected.Type, preview.Type)
require.Equal(t, expected.URL, preview.URL)
require.Equal(t, expected.Hostname, preview.Hostname)
require.Equal(t, expected.Title, preview.Title)
require.Equal(t, expected.Description, preview.Description)
require.Equal(t, expected.Thumbnail.Width, preview.Thumbnail.Width)
require.Equal(t, expected.Thumbnail.Height, preview.Thumbnail.Height)
require.Equal(t, expected.Thumbnail.URL, preview.Thumbnail.URL)
assertContainsLongString(t, expected.Thumbnail.DataURI, preview.Thumbnail.DataURI, 100)
s.Require().Equal(expected.Type, preview.Type)
s.Require().Equal(expected.URL, preview.URL)
s.Require().Equal(expected.Hostname, preview.Hostname)
s.Require().Equal(expected.Title, preview.Title)
s.Require().Equal(expected.Description, preview.Description)
s.Require().Equal(expected.Thumbnail.Width, preview.Thumbnail.Width)
s.Require().Equal(expected.Thumbnail.Height, preview.Thumbnail.Height)
s.Require().Equal(expected.Thumbnail.URL, preview.Thumbnail.URL)
s.assertContainsLongString(expected.Thumbnail.DataURI, preview.Thumbnail.DataURI, 100)
}

View File

@ -28,7 +28,6 @@ import (
"github.com/status-im/status-go/protocol/communities/token"
"github.com/status-im/status-go/protocol/discord"
"github.com/status-im/status-go/protocol/encryption/multidevice"
"github.com/status-im/status-go/protocol/linkpreview"
"github.com/status-im/status-go/protocol/protobuf"
"github.com/status-im/status-go/protocol/pushnotificationclient"
"github.com/status-im/status-go/protocol/requests"
@ -1150,7 +1149,7 @@ func (api *PublicAPI) GetLinkPreviewData(link string) (previewData urls.LinkPrev
// GetTextURLs parses text and returns a deduplicated and (somewhat) normalized
// slice of URLs. The returned URLs can be used as cache keys by clients.
func (api *PublicAPI) GetTextURLs(text string) []string {
// Pure delegation: the API layer adds no processing of its own. Both forms
// of the call are shown here (linkpreview.GetURLs and protocol.GetURLs).
return linkpreview.GetURLs(text)
return protocol.GetURLs(text)
}
// UnfurlURLs uses a best-effort approach to unfurl each URL. Failed URLs will
@ -1158,7 +1157,7 @@ func (api *PublicAPI) GetTextURLs(text string) []string {
//
// This endpoint expects the client to send URLs normalized by GetTextURLs.
func (api *PublicAPI) UnfurlURLs(urls []string) ([]common.LinkPreview, error) {
// Forwards the request to the messenger unchanged; both forms of the call are
// shown here (with and without the leading argument).
// NOTE(review): the tests pass a *http.Client in the first position, so nil
// presumably makes the messenger fall back to its own default client —
// confirm against Messenger.UnfurlURLs.
return api.service.messenger.UnfurlURLs(urls)
return api.service.messenger.UnfurlURLs(nil, urls)
}
func (api *PublicAPI) EnsVerified(pk, ensName string) error {