status-go/protocol/linkpreview/linkpreview.go

373 lines
10 KiB
Go

package linkpreview
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
neturl "net/url"
"regexp"
"strings"
"time"
"github.com/keighl/metabolize"
"go.uber.org/zap"
"golang.org/x/net/publicsuffix"
"github.com/status-im/markdown"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/protocol/common"
)
type LinkPreview struct {
common.LinkPreview
}
type Unfurler interface {
unfurl() (common.LinkPreview, error)
}
type Headers map[string]string
const (
defaultRequestTimeout = 15000 * time.Millisecond
headerAcceptJSON = "application/json; charset=utf-8"
headerAcceptText = "text/html; charset=utf-8"
// Without a particular user agent, many providers treat status-go as a
// gluttony bot, and either respond more frequently with a 429 (Too Many
// Requests), or simply refuse to return valid data. Note that using a known
// browser UA doesn't work well with some providers, such as Spotify,
// apparently they still flag status-go as a bad actor.
headerUserAgent = "status-go/v0.151.15"
// Currently set to English, but we could make this setting dynamic according
// to the user's language of choice.
headerAcceptLanguage = "en-US,en;q=0.5"
)
func fetchBody(logger *zap.Logger, httpClient http.Client, url string, headers Headers) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), defaultRequestTimeout)
defer cancel()
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("failed to perform HTTP request: %w", err)
}
for k, v := range headers {
req.Header.Set(k, v)
}
res, err := httpClient.Do(req)
if err != nil {
return nil, err
}
defer func() {
if err := res.Body.Close(); err != nil {
logger.Error("failed to close response body", zap.Error(err))
}
}()
if res.StatusCode >= http.StatusBadRequest {
return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
}
bodyBytes, err := ioutil.ReadAll(res.Body)
if err != nil {
return nil, fmt.Errorf("failed to read body bytes: %w", err)
}
return bodyBytes, nil
}
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
return common.LinkPreview{
URL: url.String(),
Hostname: url.Hostname(),
}
}
func fetchThumbnail(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreviewThumbnail, error) {
var thumbnail common.LinkPreviewThumbnail
imgBytes, err := fetchBody(logger, httpClient, url, nil)
if err != nil {
return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
}
width, height, err := images.GetImageDimensions(imgBytes)
if err != nil {
return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
}
thumbnail.Width = width
thumbnail.Height = height
dataURI, err := images.GetPayloadDataURI(imgBytes)
if err != nil {
return thumbnail, fmt.Errorf("could not build data URI: %w", err)
}
thumbnail.DataURI = dataURI
return thumbnail, nil
}
type OEmbedUnfurler struct {
logger *zap.Logger
httpClient http.Client
// oembedEndpoint describes where the consumer may request representations for
// the supported URL scheme. For example, for YouTube, it is
// https://www.youtube.com/oembed.
oembedEndpoint string
// url is the actual URL to be unfurled.
url *neturl.URL
}
type OEmbedResponse struct {
Title string `json:"title"`
ThumbnailURL string `json:"thumbnail_url"`
}
func (u OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
oembedURL, err := neturl.Parse(u.oembedEndpoint)
if err != nil {
return nil, err
}
// When format is specified, the provider MUST return data in the requested
// format, else return an error.
oembedURL.RawQuery = neturl.Values{
"url": {u.url.String()},
"format": {"json"},
}.Encode()
return oembedURL, nil
}
func (u OEmbedUnfurler) unfurl() (common.LinkPreview, error) {
preview := newDefaultLinkPreview(u.url)
oembedURL, err := u.newOEmbedURL()
if err != nil {
return preview, err
}
headers := map[string]string{
"accept": headerAcceptJSON,
"accept-language": headerAcceptLanguage,
"user-agent": headerUserAgent,
}
oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
if err != nil {
return preview, err
}
var oembedResponse OEmbedResponse
if err != nil {
return preview, err
}
err = json.Unmarshal(oembedBytes, &oembedResponse)
if err != nil {
return preview, err
}
if oembedResponse.Title == "" {
return preview, fmt.Errorf("missing required title in oEmbed response")
}
preview.Title = oembedResponse.Title
return preview, nil
}
type OpenGraphMetadata struct {
Title string `json:"title" meta:"og:title"`
Description string `json:"description" meta:"og:description"`
ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}
// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
// gives back a JSON response with a "html" field that's supposed to be embedded
// in an iframe (hardly useful for existing Status' clients).
type OpenGraphUnfurler struct {
url *neturl.URL
logger *zap.Logger
httpClient http.Client
}
func (u OpenGraphUnfurler) unfurl() (common.LinkPreview, error) {
preview := newDefaultLinkPreview(u.url)
headers := map[string]string{
"accept": headerAcceptText,
"accept-language": headerAcceptLanguage,
"user-agent": headerUserAgent,
}
bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers)
if err != nil {
return preview, err
}
var ogMetadata OpenGraphMetadata
err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata)
if err != nil {
return preview, fmt.Errorf("failed to parse OpenGraph data")
}
// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
// title tag, but article pages do. In the future, we can fallback to the
// website's title by using the <title> tag.
if ogMetadata.Title == "" {
return preview, fmt.Errorf("missing required title in OpenGraph response")
}
if ogMetadata.ThumbnailURL != "" {
t, err := fetchThumbnail(u.logger, u.httpClient, ogMetadata.ThumbnailURL)
if err != nil {
// Given we want to fetch thumbnails on a best-effort basis, if an error
// happens we simply log it.
u.logger.Info("failed to fetch thumbnail", zap.String("url", u.url.String()), zap.Error(err))
} else {
preview.Thumbnail = t
}
}
preview.Title = ogMetadata.Title
preview.Description = ogMetadata.Description
return preview, nil
}
func normalizeHostname(hostname string) string {
hostname = strings.ToLower(hostname)
re := regexp.MustCompile(`^www\.(.*)$`)
return re.ReplaceAllString(hostname, "$1")
}
func newUnfurler(logger *zap.Logger, httpClient http.Client, url *neturl.URL) Unfurler {
switch normalizeHostname(url.Hostname()) {
case "reddit.com":
return OEmbedUnfurler{
oembedEndpoint: "https://www.reddit.com/oembed",
url: url,
logger: logger,
httpClient: httpClient,
}
default:
return OpenGraphUnfurler{
url: url,
logger: logger,
httpClient: httpClient,
}
}
}
func unfurl(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreview, error) {
var preview common.LinkPreview
parsedURL, err := neturl.Parse(url)
if err != nil {
return preview, err
}
unfurler := newUnfurler(logger, httpClient, parsedURL)
preview, err = unfurler.unfurl()
if err != nil {
return preview, err
}
preview.Hostname = strings.ToLower(parsedURL.Hostname())
return preview, nil
}
// parseValidURL is a stricter version of url.Parse that performs additional
// checks to ensure the URL is valid for clients to request a link preview.
func parseValidURL(rawURL string) (*neturl.URL, error) {
u, err := neturl.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("parsing URL failed: %w", err)
}
if u.Scheme == "" {
return nil, errors.New("missing URL scheme")
}
_, err = publicsuffix.EffectiveTLDPlusOne(u.Hostname())
if err != nil {
return nil, fmt.Errorf("missing known URL domain: %w", err)
}
return u, nil
}
// GetURLs returns only what we consider unfurleable URLs.
//
// If we wanted to be extra precise and help improve UX, we could ignore URLs
// that we know can't be unfurled. This is at least possible with the oEmbed
// protocol because providers must specify an endpoint scheme.
func GetURLs(text string) []string {
parsedText := markdown.Parse([]byte(text), nil)
visitor := common.RunLinksVisitor(parsedText)
urls := make([]string, 0, len(visitor.Links))
indexed := make(map[string]any, len(visitor.Links))
for _, rawURL := range visitor.Links {
parsedURL, err := parseValidURL(rawURL)
if err != nil {
continue
}
// Lowercase the host so the URL can be used as a cache key. Particularly on
// mobile clients it is common that the first character in a text input is
// automatically uppercased. In WhatsApp they incorrectly lowercase the
// URL's path, but this is incorrect. For instance, some URL shorteners are
// case-sensitive, some websites encode base64 in the path, etc.
parsedURL.Host = strings.ToLower(parsedURL.Host)
idx := parsedURL.String()
// Removes the spurious trailing forward slash.
idx = strings.TrimRight(idx, "/")
if _, exists := indexed[idx]; exists {
continue
} else {
indexed[idx] = nil
urls = append(urls, idx)
}
}
return urls
}
func NewDefaultHTTPClient() http.Client {
return http.Client{Timeout: defaultRequestTimeout}
}
// UnfurlURLs assumes clients pass URLs verbatim that were validated and
// processed by GetURLs.
func UnfurlURLs(logger *zap.Logger, httpClient http.Client, urls []string) ([]common.LinkPreview, error) {
var err error
if logger == nil {
logger, err = zap.NewDevelopment()
if err != nil {
return nil, fmt.Errorf("failed to create logger: %w", err)
}
}
previews := make([]common.LinkPreview, 0, len(urls))
for _, url := range urls {
logger.Debug("unfurling", zap.String("url", url))
p, err := unfurl(logger, httpClient, url)
if err != nil {
logger.Info("failed to unfurl", zap.String("url", url), zap.Error(err))
continue
}
previews = append(previews, p)
}
return previews, nil
}