status-go/protocol/linkpreview/linkpreview.go

328 lines
8.3 KiB
Go

package linkpreview
import (
"context"
"errors"
"fmt"
"io/ioutil"
"net/http"
neturl "net/url"
"strings"
"time"
"github.com/keighl/metabolize"
"go.uber.org/zap"
"golang.org/x/net/publicsuffix"
"github.com/status-im/markdown"
"github.com/status-im/status-go/images"
"github.com/status-im/status-go/protocol/common"
)
// UnfurlError means a non-critical error, and that processing of the preview
// should be interrupted and the preview probably ignored.
type UnfurlError struct {
msg string
url string
err error
}
func (ue UnfurlError) Error() string {
return fmt.Sprintf("%s, url='%s'", ue.msg, ue.url)
}
func (ue UnfurlError) Unwrap() error {
return ue.err
}
type LinkPreview struct {
common.LinkPreview
}
type Unfurler interface {
unfurl(*neturl.URL) (common.LinkPreview, error)
}
const (
requestTimeout = 15000 * time.Millisecond
// Certain websites return an HTML error page if the user agent is unknown to
// them, e.g. IMDb.
defaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
// Currently set to English, but we could make this setting dynamic according
// to the user's language of choice.
defaultAcceptLanguage = "en-US,en;q=0.5"
)
var (
httpClient = http.Client{Timeout: requestTimeout}
)
func fetchResponseBody(logger *zap.Logger, url string) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
defer cancel()
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, err
}
res, err := httpClient.Do(req)
if err != nil {
return nil, err
}
defer func() {
if err = res.Body.Close(); err != nil {
logger.Error("Failed to close response body", zap.Error(err))
}
}()
if res.StatusCode >= http.StatusBadRequest {
return nil, errors.New(http.StatusText(res.StatusCode))
}
bodyBytes, err := ioutil.ReadAll(res.Body)
if err != nil {
return nil, err
}
return bodyBytes, nil
}
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
return common.LinkPreview{
URL: url.String(),
Hostname: url.Hostname(),
}
}
func httpGETForOpenGraph(url string) (*http.Response, context.CancelFunc, error) {
ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, cancel, err
}
req.Header.Set("User-Agent", defaultUserAgent)
req.Header.Set("Accept-Language", defaultAcceptLanguage)
res, err := httpClient.Do(req)
return res, cancel, err
}
func fetchThumbnail(logger *zap.Logger, url string) (common.LinkPreviewThumbnail, error) {
var thumbnail common.LinkPreviewThumbnail
imgBytes, err := fetchResponseBody(logger, url)
if err != nil {
return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
}
width, height, err := images.GetImageDimensions(imgBytes)
if err != nil {
return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
}
thumbnail.Width = width
thumbnail.Height = height
dataURI, err := images.GetPayloadDataURI(imgBytes)
if err != nil {
return thumbnail, fmt.Errorf("could not build data URI: %w", err)
}
thumbnail.DataURI = dataURI
return thumbnail, nil
}
type OpenGraphMetadata struct {
Title string `json:"title" meta:"og:title"`
Description string `json:"description" meta:"og:description"`
ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}
// OpenGraphUnfurler can be used either as the default unfurler for some websites
// (e.g. GitHub), or as a fallback strategy. It parses HTML and extract
// OpenGraph meta tags. If an oEmbed endpoint is available, it should be
// preferred.
type OpenGraphUnfurler struct {
logger *zap.Logger
}
func (u OpenGraphUnfurler) unfurl(url *neturl.URL) (common.LinkPreview, error) {
preview := newDefaultLinkPreview(url)
res, cancel, err := httpGETForOpenGraph(url.String())
defer cancel()
defer func() {
if res != nil {
if err = res.Body.Close(); err != nil {
u.logger.Error("failed to close response body", zap.Error(err))
}
}
}()
if err != nil {
return preview, UnfurlError{
msg: "failed to get HTML page",
url: url.String(),
err: err,
}
}
// Behave like WhatsApp, i.e. if the response is a 404, consider the URL
// unfurleable. We can try to unfurl from the 404 HTML, which works well for
// certain websites, like GitHub, but it also potentially confuses users
// because they'll be sharing previews that don't match the actual URLs.
if res.StatusCode == http.StatusNotFound {
return preview, UnfurlError{
msg: "could not find page",
url: url.String(),
err: errors.New(""),
}
}
var ogMetadata OpenGraphMetadata
err = metabolize.Metabolize(res.Body, &ogMetadata)
if err != nil {
return preview, UnfurlError{
msg: "failed to parse OpenGraph data",
url: url.String(),
err: err,
}
}
// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
// title tag, but article pages do. In the future, we can fallback to the
// website's title by using the <title> tag.
if ogMetadata.Title == "" {
return preview, UnfurlError{
msg: "missing title",
url: url.String(),
err: errors.New(""),
}
}
if ogMetadata.ThumbnailURL != "" {
t, err := fetchThumbnail(u.logger, ogMetadata.ThumbnailURL)
if err != nil {
// Given we want to fetch thumbnails on a best-effort basis, if an error
// happens we simply log it.
u.logger.Info("failed to fetch thumbnail", zap.String("url", url.String()), zap.Error(err))
} else {
preview.Thumbnail = t
}
}
preview.Title = ogMetadata.Title
preview.Description = ogMetadata.Description
return preview, nil
}
func newUnfurler(logger *zap.Logger, url *neturl.URL) Unfurler {
u := new(OpenGraphUnfurler)
u.logger = logger
return u
}
func unfurl(logger *zap.Logger, url string) (common.LinkPreview, error) {
var preview common.LinkPreview
parsedURL, err := neturl.Parse(url)
if err != nil {
return preview, err
}
unfurler := newUnfurler(logger, parsedURL)
preview, err = unfurler.unfurl(parsedURL)
if err != nil {
return preview, err
}
preview.Hostname = strings.ToLower(parsedURL.Hostname())
return preview, nil
}
// parseValidURL is a stricter version of url.Parse that performs additional
// checks to ensure the URL is valid for clients to request a link preview.
func parseValidURL(rawURL string) (*neturl.URL, error) {
u, err := neturl.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("parsing URL failed: %w", err)
}
if u.Scheme == "" {
return nil, errors.New("missing URL scheme")
}
_, err = publicsuffix.EffectiveTLDPlusOne(u.Hostname())
if err != nil {
return nil, fmt.Errorf("missing known URL domain: %w", err)
}
return u, nil
}
// GetURLs returns only what we consider unfurleable URLs.
func GetURLs(text string) []string {
parsedText := markdown.Parse([]byte(text), nil)
visitor := common.RunLinksVisitor(parsedText)
urls := make([]string, 0, len(visitor.Links))
indexed := make(map[string]any, len(visitor.Links))
for _, rawURL := range visitor.Links {
parsedURL, err := parseValidURL(rawURL)
if err != nil {
continue
}
// Lowercase the host so the URL can be used as a cache key. Particularly on
// mobile clients it is common that the first character in a text input is
// automatically uppercased. In WhatsApp they incorrectly lowercase the
// URL's path, but this is incorrect. For instance, some URL shorteners are
// case-sensitive, some websites encode base64 in the path, etc.
parsedURL.Host = strings.ToLower(parsedURL.Host)
idx := parsedURL.String()
// Removes the spurious trailing forward slash.
idx = strings.TrimRight(idx, "/")
if _, exists := indexed[idx]; exists {
continue
} else {
indexed[idx] = nil
urls = append(urls, idx)
}
}
return urls
}
// UnfurlURLs assumes clients pass URLs verbatim that were validated and
// processed by GetURLs.
func UnfurlURLs(logger *zap.Logger, urls []string) ([]common.LinkPreview, error) {
var err error
if logger == nil {
logger, err = zap.NewDevelopment()
if err != nil {
return nil, fmt.Errorf("failed to create logger: %w", err)
}
}
previews := make([]common.LinkPreview, 0, len(urls))
for _, url := range urls {
p, err := unfurl(logger, url)
if err != nil {
if unfurlErr, ok := err.(UnfurlError); ok {
logger.Info("failed to unfurl", zap.Error(unfurlErr))
continue
}
return nil, err
}
previews = append(previews, p)
}
return previews, nil
}