mirror of
https://github.com/status-im/status-go.git
synced 2025-01-21 20:20:29 +00:00
92b5d831fe
Add support for unfurling a wider range of websites. Most code changes are related to the implementation of a new Unfurler, an OEmbedUnfurler, which is necessary to get metadata for Reddit URLs using oEmbed, since Reddit does not support OpenGraph meta tags. The new unfurler will also be useful for other websites, like Twitter. Also the user agent was changed, and now more websites consider status-go reasonably human. Related to issue https://github.com/status-im/status-mobile/issues/15918 Example hostnames that are now unfurlable: reddit.com, open.spotify.com, music.youtube.com Other improvements: - Better error handling, especially because I wasn't wrapping errors correctly. I also removed the unnecessary custom error UnfurlErr. - I made tests truly deterministic by parameterizing the http.Client instance and by customizing its Transport field (except for some failing conditions where it's even good to hit the real servers).
373 lines
10 KiB
Go
373 lines
10 KiB
Go
package linkpreview
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net/http"
|
|
neturl "net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/keighl/metabolize"
|
|
"go.uber.org/zap"
|
|
"golang.org/x/net/publicsuffix"
|
|
|
|
"github.com/status-im/markdown"
|
|
|
|
"github.com/status-im/status-go/images"
|
|
"github.com/status-im/status-go/protocol/common"
|
|
)
|
|
|
|
type LinkPreview struct {
|
|
common.LinkPreview
|
|
}
|
|
|
|
type Unfurler interface {
|
|
unfurl() (common.LinkPreview, error)
|
|
}
|
|
|
|
// Headers maps an HTTP request header name to the single value that fetchBody
// sets for it on the outgoing request.
type Headers map[string]string
|
|
|
|
const (
	// defaultRequestTimeout is the deadline applied to the request context in
	// fetchBody and to the client built by NewDefaultHTTPClient.
	defaultRequestTimeout = 15 * time.Second

	// Values sent in the Accept request header, depending on whether JSON or
	// HTML is expected back.
	headerAcceptJSON = "application/json; charset=utf-8"
	headerAcceptText = "text/html; charset=utf-8"

	// headerUserAgent identifies status-go explicitly. Without a particular
	// user agent, many providers treat status-go as a greedy bot and either
	// answer more often with a 429 (Too Many Requests) or simply refuse to
	// return valid data. Pretending to be a well-known browser does not work
	// well with some providers either (e.g. Spotify), which apparently still
	// flag status-go as a bad actor.
	headerUserAgent = "status-go/v0.151.15"

	// headerAcceptLanguage is currently fixed to English. It could be made
	// dynamic according to the user's language of choice.
	headerAcceptLanguage = "en-US,en;q=0.5"
)
|
|
|
|
func fetchBody(logger *zap.Logger, httpClient http.Client, url string, headers Headers) ([]byte, error) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), defaultRequestTimeout)
|
|
defer cancel()
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to perform HTTP request: %w", err)
|
|
}
|
|
|
|
for k, v := range headers {
|
|
req.Header.Set(k, v)
|
|
}
|
|
|
|
res, err := httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
if err := res.Body.Close(); err != nil {
|
|
logger.Error("failed to close response body", zap.Error(err))
|
|
}
|
|
}()
|
|
|
|
if res.StatusCode >= http.StatusBadRequest {
|
|
return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
|
|
}
|
|
|
|
bodyBytes, err := ioutil.ReadAll(res.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read body bytes: %w", err)
|
|
}
|
|
|
|
return bodyBytes, nil
|
|
}
|
|
|
|
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
|
|
return common.LinkPreview{
|
|
URL: url.String(),
|
|
Hostname: url.Hostname(),
|
|
}
|
|
}
|
|
|
|
func fetchThumbnail(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreviewThumbnail, error) {
|
|
var thumbnail common.LinkPreviewThumbnail
|
|
|
|
imgBytes, err := fetchBody(logger, httpClient, url, nil)
|
|
if err != nil {
|
|
return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
|
|
}
|
|
|
|
width, height, err := images.GetImageDimensions(imgBytes)
|
|
if err != nil {
|
|
return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
|
|
}
|
|
thumbnail.Width = width
|
|
thumbnail.Height = height
|
|
|
|
dataURI, err := images.GetPayloadDataURI(imgBytes)
|
|
if err != nil {
|
|
return thumbnail, fmt.Errorf("could not build data URI: %w", err)
|
|
}
|
|
thumbnail.DataURI = dataURI
|
|
|
|
return thumbnail, nil
|
|
}
|
|
|
|
type OEmbedUnfurler struct {
|
|
logger *zap.Logger
|
|
httpClient http.Client
|
|
// oembedEndpoint describes where the consumer may request representations for
|
|
// the supported URL scheme. For example, for YouTube, it is
|
|
// https://www.youtube.com/oembed.
|
|
oembedEndpoint string
|
|
// url is the actual URL to be unfurled.
|
|
url *neturl.URL
|
|
}
|
|
|
|
// OEmbedResponse holds the fields decoded from the JSON document returned by
// an oEmbed endpoint. Any other field of the document is ignored.
type OEmbedResponse struct {
	// Title is decoded from the "title" key.
	Title string `json:"title"`
	// ThumbnailURL is decoded from the "thumbnail_url" key.
	ThumbnailURL string `json:"thumbnail_url"`
}
|
|
|
|
func (u OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
|
|
oembedURL, err := neturl.Parse(u.oembedEndpoint)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// When format is specified, the provider MUST return data in the requested
|
|
// format, else return an error.
|
|
oembedURL.RawQuery = neturl.Values{
|
|
"url": {u.url.String()},
|
|
"format": {"json"},
|
|
}.Encode()
|
|
|
|
return oembedURL, nil
|
|
}
|
|
|
|
func (u OEmbedUnfurler) unfurl() (common.LinkPreview, error) {
|
|
preview := newDefaultLinkPreview(u.url)
|
|
|
|
oembedURL, err := u.newOEmbedURL()
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
|
|
headers := map[string]string{
|
|
"accept": headerAcceptJSON,
|
|
"accept-language": headerAcceptLanguage,
|
|
"user-agent": headerUserAgent,
|
|
}
|
|
oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
|
|
var oembedResponse OEmbedResponse
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
err = json.Unmarshal(oembedBytes, &oembedResponse)
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
|
|
if oembedResponse.Title == "" {
|
|
return preview, fmt.Errorf("missing required title in oEmbed response")
|
|
}
|
|
|
|
preview.Title = oembedResponse.Title
|
|
return preview, nil
|
|
}
|
|
|
|
// OpenGraphMetadata holds the OpenGraph properties read from a page. The
// `meta` tag names the property each field is populated from; the `json` tag
// names the key used when the value is encoded as JSON.
type OpenGraphMetadata struct {
	// Title comes from og:title.
	Title string `json:"title" meta:"og:title"`
	// Description comes from og:description.
	Description string `json:"description" meta:"og:description"`
	// ThumbnailURL comes from og:image.
	ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}
|
|
|
|
// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
|
|
// gives back a JSON response with a "html" field that's supposed to be embedded
|
|
// in an iframe (hardly useful for existing Status' clients).
|
|
type OpenGraphUnfurler struct {
|
|
url *neturl.URL
|
|
logger *zap.Logger
|
|
httpClient http.Client
|
|
}
|
|
|
|
func (u OpenGraphUnfurler) unfurl() (common.LinkPreview, error) {
|
|
preview := newDefaultLinkPreview(u.url)
|
|
|
|
headers := map[string]string{
|
|
"accept": headerAcceptText,
|
|
"accept-language": headerAcceptLanguage,
|
|
"user-agent": headerUserAgent,
|
|
}
|
|
bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers)
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
|
|
var ogMetadata OpenGraphMetadata
|
|
err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata)
|
|
if err != nil {
|
|
return preview, fmt.Errorf("failed to parse OpenGraph data")
|
|
}
|
|
|
|
// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
|
|
// title tag, but article pages do. In the future, we can fallback to the
|
|
// website's title by using the <title> tag.
|
|
if ogMetadata.Title == "" {
|
|
return preview, fmt.Errorf("missing required title in OpenGraph response")
|
|
}
|
|
|
|
if ogMetadata.ThumbnailURL != "" {
|
|
t, err := fetchThumbnail(u.logger, u.httpClient, ogMetadata.ThumbnailURL)
|
|
if err != nil {
|
|
// Given we want to fetch thumbnails on a best-effort basis, if an error
|
|
// happens we simply log it.
|
|
u.logger.Info("failed to fetch thumbnail", zap.String("url", u.url.String()), zap.Error(err))
|
|
} else {
|
|
preview.Thumbnail = t
|
|
}
|
|
}
|
|
|
|
preview.Title = ogMetadata.Title
|
|
preview.Description = ogMetadata.Description
|
|
return preview, nil
|
|
}
|
|
|
|
// normalizeHostname lowercases hostname and removes a single leading "www."
// label, so that e.g. "WWW.Reddit.com" and "reddit.com" compare equal.
//
// Only one prefix is removed: "www.www.example.com" becomes
// "www.example.com". A plain prefix trim is used instead of a regular
// expression so that nothing has to be compiled on each call.
func normalizeHostname(hostname string) string {
	return strings.TrimPrefix(strings.ToLower(hostname), "www.")
}
|
|
|
|
func newUnfurler(logger *zap.Logger, httpClient http.Client, url *neturl.URL) Unfurler {
|
|
switch normalizeHostname(url.Hostname()) {
|
|
case "reddit.com":
|
|
return OEmbedUnfurler{
|
|
oembedEndpoint: "https://www.reddit.com/oembed",
|
|
url: url,
|
|
logger: logger,
|
|
httpClient: httpClient,
|
|
}
|
|
default:
|
|
return OpenGraphUnfurler{
|
|
url: url,
|
|
logger: logger,
|
|
httpClient: httpClient,
|
|
}
|
|
}
|
|
}
|
|
|
|
func unfurl(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreview, error) {
|
|
var preview common.LinkPreview
|
|
|
|
parsedURL, err := neturl.Parse(url)
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
|
|
unfurler := newUnfurler(logger, httpClient, parsedURL)
|
|
preview, err = unfurler.unfurl()
|
|
if err != nil {
|
|
return preview, err
|
|
}
|
|
preview.Hostname = strings.ToLower(parsedURL.Hostname())
|
|
|
|
return preview, nil
|
|
}
|
|
|
|
// parseValidURL is a stricter version of url.Parse that performs additional
|
|
// checks to ensure the URL is valid for clients to request a link preview.
|
|
func parseValidURL(rawURL string) (*neturl.URL, error) {
|
|
u, err := neturl.Parse(rawURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parsing URL failed: %w", err)
|
|
}
|
|
|
|
if u.Scheme == "" {
|
|
return nil, errors.New("missing URL scheme")
|
|
}
|
|
|
|
_, err = publicsuffix.EffectiveTLDPlusOne(u.Hostname())
|
|
if err != nil {
|
|
return nil, fmt.Errorf("missing known URL domain: %w", err)
|
|
}
|
|
|
|
return u, nil
|
|
}
|
|
|
|
// GetURLs returns only what we consider unfurleable URLs.
|
|
//
|
|
// If we wanted to be extra precise and help improve UX, we could ignore URLs
|
|
// that we know can't be unfurled. This is at least possible with the oEmbed
|
|
// protocol because providers must specify an endpoint scheme.
|
|
func GetURLs(text string) []string {
|
|
parsedText := markdown.Parse([]byte(text), nil)
|
|
visitor := common.RunLinksVisitor(parsedText)
|
|
|
|
urls := make([]string, 0, len(visitor.Links))
|
|
indexed := make(map[string]any, len(visitor.Links))
|
|
|
|
for _, rawURL := range visitor.Links {
|
|
parsedURL, err := parseValidURL(rawURL)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// Lowercase the host so the URL can be used as a cache key. Particularly on
|
|
// mobile clients it is common that the first character in a text input is
|
|
// automatically uppercased. In WhatsApp they incorrectly lowercase the
|
|
// URL's path, but this is incorrect. For instance, some URL shorteners are
|
|
// case-sensitive, some websites encode base64 in the path, etc.
|
|
parsedURL.Host = strings.ToLower(parsedURL.Host)
|
|
|
|
idx := parsedURL.String()
|
|
// Removes the spurious trailing forward slash.
|
|
idx = strings.TrimRight(idx, "/")
|
|
if _, exists := indexed[idx]; exists {
|
|
continue
|
|
} else {
|
|
indexed[idx] = nil
|
|
urls = append(urls, idx)
|
|
}
|
|
}
|
|
|
|
return urls
|
|
}
|
|
|
|
func NewDefaultHTTPClient() http.Client {
|
|
return http.Client{Timeout: defaultRequestTimeout}
|
|
}
|
|
|
|
// UnfurlURLs assumes clients pass URLs verbatim that were validated and
|
|
// processed by GetURLs.
|
|
func UnfurlURLs(logger *zap.Logger, httpClient http.Client, urls []string) ([]common.LinkPreview, error) {
|
|
var err error
|
|
if logger == nil {
|
|
logger, err = zap.NewDevelopment()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create logger: %w", err)
|
|
}
|
|
}
|
|
|
|
previews := make([]common.LinkPreview, 0, len(urls))
|
|
|
|
for _, url := range urls {
|
|
logger.Debug("unfurling", zap.String("url", url))
|
|
p, err := unfurl(logger, httpClient, url)
|
|
if err != nil {
|
|
logger.Info("failed to unfurl", zap.String("url", url), zap.Error(err))
|
|
continue
|
|
}
|
|
previews = append(previews, p)
|
|
}
|
|
|
|
return previews, nil
|
|
}
|