package linkpreview

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	neturl "net/url"
	"strings"
	"time"

	"github.com/keighl/metabolize"
	"go.uber.org/zap"
	"golang.org/x/net/publicsuffix"

	"github.com/status-im/markdown"
	"github.com/status-im/status-go/images"
	"github.com/status-im/status-go/protocol/common"
)

// UnfurlError means a non-critical error, and that processing of the preview
// should be interrupted and the preview probably ignored.
type UnfurlError struct {
	msg string // human-readable description of what failed
	url string // URL that was being unfurled
	err error  // underlying cause, exposed through Unwrap
}

// Error formats the message together with the URL. The wrapped error is not
// part of the text; use Unwrap (or errors.Is / errors.As) to reach it.
func (ue UnfurlError) Error() string {
	return fmt.Sprintf("%s, url='%s'", ue.msg, ue.url)
}

// Unwrap returns the underlying error so the standard errors helpers can
// inspect the chain.
func (ue UnfurlError) Unwrap() error {
	return ue.err
}

// LinkPreview embeds common.LinkPreview without adding any fields.
type LinkPreview struct {
	common.LinkPreview
}

// Unfurler is implemented by the strategies that turn a parsed URL into a
// common.LinkPreview.
type Unfurler interface {
	unfurl(*neturl.URL) (common.LinkPreview, error)
}

const (
	// requestTimeout bounds both the shared HTTP client and the per-request
	// contexts created in this file.
	requestTimeout = 15000 * time.Millisecond

	// Certain websites return an HTML error page if the user agent is unknown to
	// them, e.g. IMDb.
	defaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"

	// Currently set to English, but we could make this setting dynamic according
	// to the user's language of choice.
	defaultAcceptLanguage = "en-US,en;q=0.5"
)

var (
	// httpClient is the single client used for every request made in this file.
	httpClient = http.Client{Timeout: requestTimeout}
)

// fetchResponseBody performs a GET request against url and returns the whole
// response body.
//
// Any status code >= 400 is reported as an error whose message is the standard
// status text. For status codes without a registered text (where
// http.StatusText returns "") a numeric message is used instead, so that the
// returned error is never empty. A failure to close the body is only logged.
func fetchResponseBody(logger *zap.Logger, url string) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return nil, err
	}

	res, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer func() {
		// Use a scoped variable so the close error never leaks into the
		// function's own err.
		if closeErr := res.Body.Close(); closeErr != nil {
			logger.Error("Failed to close response body", zap.Error(closeErr))
		}
	}()

	if res.StatusCode >= http.StatusBadRequest {
		statusText := http.StatusText(res.StatusCode)
		if statusText == "" {
			// Non-standard status code: avoid returning an error with an
			// empty message.
			statusText = fmt.Sprintf("unexpected HTTP status %d", res.StatusCode)
		}
		return nil, errors.New(statusText)
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}

	return bodyBytes, nil
}

// newDefaultLinkPreview returns a preview with only the URL and Hostname
// fields filled in from url.
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
	return common.LinkPreview{
		URL:      url.String(),
		Hostname: url.Hostname(),
	}
}

// httpGETForOpenGraph issues a GET request for url with the default
// User-Agent and Accept-Language headers set.
//
// The returned cancel function is never nil, including when an error is
// returned, and must be called by the caller to release the request context.
// The caller is also responsible for closing the response body when the
// response is non-nil.
func httpGETForOpenGraph(url string) (*http.Response, context.CancelFunc, error) {
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)

	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		return nil, cancel, err
	}
	req.Header.Set("User-Agent", defaultUserAgent)
	req.Header.Set("Accept-Language", defaultAcceptLanguage)

	res, err := httpClient.Do(req)
	return res, cancel, err
}

// fetchThumbnail downloads the image at url and returns its dimensions and a
// data URI built from the downloaded bytes.
//
// On failure the returned thumbnail holds whatever fields were set before the
// failing step, together with a wrapped error describing that step.
func fetchThumbnail(logger *zap.Logger, url string) (common.LinkPreviewThumbnail, error) {
	var thumbnail common.LinkPreviewThumbnail

	imgBytes, err := fetchResponseBody(logger, url)
	if err != nil {
		return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
	}

	width, height, err := images.GetImageDimensions(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
	}
	thumbnail.Width = width
	thumbnail.Height = height

	dataURI, err := images.GetPayloadDataURI(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not build data URI: %w", err)
	}
	thumbnail.DataURI = dataURI

	return thumbnail, nil
}

// OpenGraphMetadata is the destination struct passed to metabolize when
// parsing a page. The `meta` tags name the og:title, og:description and
// og:image properties.
type OpenGraphMetadata struct {
	Title        string `json:"title" meta:"og:title"`
	Description  string `json:"description" meta:"og:description"`
	ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}

// OpenGraphUnfurler can be used either as the default unfurler for some websites
// (e.g. GitHub), or as a fallback strategy. It parses HTML and extracts
// OpenGraph meta tags. If an oEmbed endpoint is available, it should be
// preferred.
type OpenGraphUnfurler struct {
	logger *zap.Logger
}

func (u OpenGraphUnfurler) unfurl(url *neturl.URL) (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(url)

	res, cancel, err := httpGETForOpenGraph(url.String())
	defer cancel()
	defer func() {
		if res != nil {
			if err = res.Body.Close(); err != nil {
				u.logger.Error("failed to close response body", zap.Error(err))
			}
		}
	}()
	if err != nil {
		return preview, UnfurlError{
			msg: "failed to get HTML page",
			url: url.String(),
			err: err,
		}
	}

	// Behave like WhatsApp, i.e. if the response is a 404, consider the URL
	// not unfurlable. We could try to unfurl from the 404 HTML, which works well
	// for certain websites, like GitHub, but it also potentially confuses users
	// because they'll be sharing previews that don't match the actual URLs.
	if res.StatusCode == http.StatusNotFound {
		return preview, UnfurlError{
			msg: "could not find page",
			url: url.String(),
			err: errors.New(""),
		}
	}

	var ogMetadata OpenGraphMetadata
	err = metabolize.Metabolize(res.Body, &ogMetadata)
	if err != nil {
		return preview, UnfurlError{
			msg: "failed to parse OpenGraph data",
			url: url.String(),
			err: err,
		}
	}

	// There are URLs like https://wikipedia.org/ that don't have an OpenGraph
	// title tag, but article pages do. In the future, we can fallback to the
	// website's title by using the