status-go/protocol/linkpreview/linkpreview.go
Icaro Motta 92b5d831fe
Support unfurling more websites (#3530)
Add support for unfurling a wider range of websites. Most code changes are
related to the implementation of a new Unfurler, an OEmbedUnfurler, which is
necessary to get metadata for Reddit URLs via oEmbed, since Reddit does not
support OpenGraph meta tags. The new unfurler will also be useful for other
websites, such as Twitter. The user agent was also changed, so more websites
now consider status-go reasonably human.

Related to issue https://github.com/status-im/status-mobile/issues/15918

Example hostnames that are now unfurleable: reddit.com, open.spotify.com,
music.youtube.com

Other improvements:

- Better error handling, especially because I wasn't wrapping errors correctly.
  I also removed the unnecessary custom error UnfurlErr.
- I made tests truly deterministic by parameterizing the http.Client instance
  and customizing its Transport field (except for a few failure cases where it
  is actually useful to hit the real servers). A rough sketch of this technique
  is shown below.
2023-06-05 07:46:17 -03:00
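The deterministic-test technique mentioned above is not part of this file, so here is a minimal sketch of it under stated assumptions: the stubTransport type, the test name, and the example.com URL with its canned OpenGraph HTML are hypothetical, and only illustrate injecting an http.Client whose Transport serves canned responses instead of reaching the network.

package linkpreview

import (
	"io/ioutil"
	"net/http"
	"strings"
	"testing"
)

// stubTransport is a hypothetical http.RoundTripper that serves canned
// response bodies keyed by URL, so the test never touches the network.
type stubTransport struct {
	responses map[string]string
}

func (t stubTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	body, ok := t.responses[req.URL.String()]
	status := http.StatusOK
	if !ok {
		status = http.StatusNotFound
	}
	return &http.Response{
		StatusCode: status,
		Header:     make(http.Header),
		Body:       ioutil.NopCloser(strings.NewReader(body)),
	}, nil
}

func TestUnfurlIsDeterministic(t *testing.T) {
	// The URL and the OpenGraph HTML below are made up for illustration.
	client := http.Client{Transport: stubTransport{responses: map[string]string{
		"https://example.com/article": `<html><head><meta property="og:title" content="Example"/></head></html>`,
	}}}

	previews, err := UnfurlURLs(nil, client, []string{"https://example.com/article"})
	if err != nil {
		t.Fatal(err)
	}
	if len(previews) != 1 || previews[0].Title != "Example" {
		t.Fatalf("unexpected previews: %+v", previews)
	}
}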


package linkpreview

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	neturl "net/url"
	"regexp"
	"strings"
	"time"

	"github.com/keighl/metabolize"
	"go.uber.org/zap"
	"golang.org/x/net/publicsuffix"

	"github.com/status-im/markdown"

	"github.com/status-im/status-go/images"
	"github.com/status-im/status-go/protocol/common"
)
type LinkPreview struct {
	common.LinkPreview
}

type Unfurler interface {
	unfurl() (common.LinkPreview, error)
}

type Headers map[string]string
const (
	defaultRequestTimeout = 15000 * time.Millisecond

	headerAcceptJSON = "application/json; charset=utf-8"
	headerAcceptText = "text/html; charset=utf-8"

	// Without a particular user agent, many providers treat status-go as a
	// gluttonous bot, and either respond more frequently with a 429 (Too Many
	// Requests), or simply refuse to return valid data. Note that using a known
	// browser UA doesn't work well with some providers, such as Spotify;
	// apparently they still flag status-go as a bad actor.
	headerUserAgent = "status-go/v0.151.15"

	// Currently set to English, but we could make this setting dynamic according
	// to the user's language of choice.
	headerAcceptLanguage = "en-US,en;q=0.5"
)
func fetchBody(logger *zap.Logger, httpClient http.Client, url string, headers Headers) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultRequestTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build HTTP request: %w", err)
	}

	for k, v := range headers {
		req.Header.Set(k, v)
	}

	res, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer func() {
		if err := res.Body.Close(); err != nil {
			logger.Error("failed to close response body", zap.Error(err))
		}
	}()

	if res.StatusCode >= http.StatusBadRequest {
		return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read body bytes: %w", err)
	}

	return bodyBytes, nil
}
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
	return common.LinkPreview{
		URL:      url.String(),
		Hostname: url.Hostname(),
	}
}
func fetchThumbnail(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreviewThumbnail, error) {
	var thumbnail common.LinkPreviewThumbnail

	imgBytes, err := fetchBody(logger, httpClient, url, nil)
	if err != nil {
		return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
	}

	width, height, err := images.GetImageDimensions(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
	}
	thumbnail.Width = width
	thumbnail.Height = height

	dataURI, err := images.GetPayloadDataURI(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not build data URI: %w", err)
	}
	thumbnail.DataURI = dataURI

	return thumbnail, nil
}
type OEmbedUnfurler struct {
	logger     *zap.Logger
	httpClient http.Client
	// oembedEndpoint describes where the consumer may request representations for
	// the supported URL scheme. For example, for YouTube, it is
	// https://www.youtube.com/oembed.
	oembedEndpoint string
	// url is the actual URL to be unfurled.
	url *neturl.URL
}

type OEmbedResponse struct {
	Title        string `json:"title"`
	ThumbnailURL string `json:"thumbnail_url"`
}
func (u OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
	oembedURL, err := neturl.Parse(u.oembedEndpoint)
	if err != nil {
		return nil, err
	}

	// When format is specified, the provider MUST return data in the requested
	// format, else return an error.
	oembedURL.RawQuery = neturl.Values{
		"url":    {u.url.String()},
		"format": {"json"},
	}.Encode()

	return oembedURL, nil
}
func (u OEmbedUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)

	oembedURL, err := u.newOEmbedURL()
	if err != nil {
		return preview, err
	}

	headers := map[string]string{
		"accept":          headerAcceptJSON,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
	if err != nil {
		return preview, err
	}

	var oembedResponse OEmbedResponse
	err = json.Unmarshal(oembedBytes, &oembedResponse)
	if err != nil {
		return preview, err
	}

	if oembedResponse.Title == "" {
		return preview, fmt.Errorf("missing required title in oEmbed response")
	}

	preview.Title = oembedResponse.Title
	return preview, nil
}
type OpenGraphMetadata struct {
	Title        string `json:"title" meta:"og:title"`
	Description  string `json:"description" meta:"og:description"`
	ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}

// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
// gives back a JSON response with an "html" field that's supposed to be
// embedded in an iframe (hardly useful for existing Status clients).
type OpenGraphUnfurler struct {
	url        *neturl.URL
	logger     *zap.Logger
	httpClient http.Client
}
func (u OpenGraphUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)

	headers := map[string]string{
		"accept":          headerAcceptText,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers)
	if err != nil {
		return preview, err
	}

	var ogMetadata OpenGraphMetadata
	err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata)
	if err != nil {
		return preview, fmt.Errorf("failed to parse OpenGraph data: %w", err)
	}

	// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
	// title tag, but article pages do. In the future, we can fall back to the
	// website's title by using the <title> tag.
	if ogMetadata.Title == "" {
		return preview, fmt.Errorf("missing required title in OpenGraph response")
	}

	if ogMetadata.ThumbnailURL != "" {
		t, err := fetchThumbnail(u.logger, u.httpClient, ogMetadata.ThumbnailURL)
		if err != nil {
			// Given we want to fetch thumbnails on a best-effort basis, if an error
			// happens we simply log it.
			u.logger.Info("failed to fetch thumbnail", zap.String("url", u.url.String()), zap.Error(err))
		} else {
			preview.Thumbnail = t
		}
	}

	preview.Title = ogMetadata.Title
	preview.Description = ogMetadata.Description
	return preview, nil
}
func normalizeHostname(hostname string) string {
	hostname = strings.ToLower(hostname)
	re := regexp.MustCompile(`^www\.(.*)$`)
	return re.ReplaceAllString(hostname, "$1")
}

func newUnfurler(logger *zap.Logger, httpClient http.Client, url *neturl.URL) Unfurler {
	switch normalizeHostname(url.Hostname()) {
	case "reddit.com":
		return OEmbedUnfurler{
			oembedEndpoint: "https://www.reddit.com/oembed",
			url:            url,
			logger:         logger,
			httpClient:     httpClient,
		}
	default:
		return OpenGraphUnfurler{
			url:        url,
			logger:     logger,
			httpClient: httpClient,
		}
	}
}
func unfurl(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreview, error) {
	var preview common.LinkPreview

	parsedURL, err := neturl.Parse(url)
	if err != nil {
		return preview, err
	}

	unfurler := newUnfurler(logger, httpClient, parsedURL)
	preview, err = unfurler.unfurl()
	if err != nil {
		return preview, err
	}

	preview.Hostname = strings.ToLower(parsedURL.Hostname())
	return preview, nil
}
// parseValidURL is a stricter version of url.Parse that performs additional
// checks to ensure the URL is valid for clients to request a link preview.
func parseValidURL(rawURL string) (*neturl.URL, error) {
	u, err := neturl.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("parsing URL failed: %w", err)
	}

	if u.Scheme == "" {
		return nil, errors.New("missing URL scheme")
	}

	_, err = publicsuffix.EffectiveTLDPlusOne(u.Hostname())
	if err != nil {
		return nil, fmt.Errorf("missing known URL domain: %w", err)
	}

	return u, nil
}
// GetURLs returns only what we consider unfurleable URLs.
//
// If we wanted to be extra precise and help improve UX, we could ignore URLs
// that we know can't be unfurled. This is at least possible with the oEmbed
// protocol because providers must specify an endpoint scheme.
func GetURLs(text string) []string {
	parsedText := markdown.Parse([]byte(text), nil)
	visitor := common.RunLinksVisitor(parsedText)

	urls := make([]string, 0, len(visitor.Links))
	indexed := make(map[string]any, len(visitor.Links))

	for _, rawURL := range visitor.Links {
		parsedURL, err := parseValidURL(rawURL)
		if err != nil {
			continue
		}
		// Lowercase the host so the URL can be used as a cache key. Particularly on
		// mobile clients it is common that the first character in a text input is
		// automatically uppercased. WhatsApp goes further and lowercases the URL's
		// path as well, but that is incorrect: some URL shorteners are case-sensitive,
		// some websites encode base64 in the path, etc.
		parsedURL.Host = strings.ToLower(parsedURL.Host)

		// Remove the spurious trailing forward slash.
		idx := strings.TrimRight(parsedURL.String(), "/")
		if _, exists := indexed[idx]; exists {
			continue
		}
		indexed[idx] = nil
		urls = append(urls, idx)
	}

	return urls
}
func NewDefaultHTTPClient() http.Client {
	return http.Client{Timeout: defaultRequestTimeout}
}

// UnfurlURLs assumes clients pass URLs verbatim that were validated and
// processed by GetURLs.
func UnfurlURLs(logger *zap.Logger, httpClient http.Client, urls []string) ([]common.LinkPreview, error) {
	var err error
	if logger == nil {
		logger, err = zap.NewDevelopment()
		if err != nil {
			return nil, fmt.Errorf("failed to create logger: %w", err)
		}
	}

	previews := make([]common.LinkPreview, 0, len(urls))

	for _, url := range urls {
		logger.Debug("unfurling", zap.String("url", url))
		p, err := unfurl(logger, httpClient, url)
		if err != nil {
			logger.Info("failed to unfurl", zap.String("url", url), zap.Error(err))
			continue
		}
		previews = append(previews, p)
	}

	return previews, nil
}
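
For completeness, a rough usage sketch of the exported API (GetURLs, NewDefaultHTTPClient, UnfurlURLs) as a caller outside this package might combine it. The message text and printed fields are made up for illustration, and unlike the stubbed-Transport test sketch above, this variant performs real network requests.

package main

import (
	"fmt"

	"github.com/status-im/status-go/protocol/linkpreview"
)

func main() {
	// Extract candidate URLs from a hypothetical chat message.
	text := "interesting thread: https://www.reddit.com/r/golang/ and https://github.com/status-im/status-go"
	urls := linkpreview.GetURLs(text)

	// Unfurl them with the package's default HTTP client; nil logger makes
	// UnfurlURLs fall back to a development logger.
	previews, err := linkpreview.UnfurlURLs(nil, linkpreview.NewDefaultHTTPClient(), urls)
	if err != nil {
		fmt.Println("unfurling failed:", err)
		return
	}
	for _, p := range previews {
		fmt.Printf("%s: %s\n", p.Hostname, p.Title)
	}
}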