package linkpreview

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	neturl "net/url"
	"regexp"
	"strings"
	"time"

	"github.com/keighl/metabolize"
	"go.uber.org/zap"
	"golang.org/x/net/publicsuffix"

	"github.com/status-im/markdown"
	"github.com/status-im/status-go/images"
	"github.com/status-im/status-go/protocol/common"
)

// LinkPreview embeds common.LinkPreview, exposing its fields and methods under
// this package's own type name.
type LinkPreview struct {
	common.LinkPreview
}

// Unfurler is implemented by the strategies in this file (for example
// OEmbedUnfurler) that turn a URL into a common.LinkPreview.
type Unfurler interface {
	unfurl() (common.LinkPreview, error)
}

// Headers maps an HTTP header name to the single value that fetchBody sets on
// the outgoing request.
type Headers map[string]string

const (
	// defaultRequestTimeout is the deadline of the context attached to every
	// request issued by fetchBody.
	defaultRequestTimeout = 15000 * time.Millisecond

	headerAcceptJSON = "application/json; charset=utf-8"
	headerAcceptText = "text/html; charset=utf-8"

	// Without a particular user agent, many providers treat status-go as a
	// gluttonous bot, and either respond more frequently with a 429 (Too Many
	// Requests), or simply refuse to return valid data. Note that using a known
	// browser UA doesn't work well with some providers, such as Spotify;
	// apparently they still flag status-go as a bad actor.
	headerUserAgent = "status-go/v0.151.15"

	// Currently set to English, but we could make this setting dynamic according
	// to the user's language of choice.
	headerAcceptLanguage = "en-US,en;q=0.5"
)

// fetchBody performs a GET request against url with the given headers and
// returns the entire response body.
//
// The request carries a context that expires after defaultRequestTimeout. Any
// response whose status code is 400 or above is reported as an error and its
// body is not returned. A failure to close the response body is only logged
// through logger; it does not change the returned values.
func fetchBody(logger *zap.Logger, httpClient http.Client, url string, headers Headers) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultRequestTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		// Nothing has been sent at this point: this is a construction failure
		// (e.g. an unparsable URL), so say so instead of claiming the request
		// was performed.
		return nil, fmt.Errorf("failed to create HTTP request: %w", err)
	}
	// Ranging over a nil Headers map is a no-op, so callers may pass nil.
	for name, value := range headers {
		req.Header.Set(name, value)
	}

	res, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer func() {
		if closeErr := res.Body.Close(); closeErr != nil {
			logger.Error("failed to close response body", zap.Error(closeErr))
		}
	}()

	if res.StatusCode >= http.StatusBadRequest {
		return nil, fmt.Errorf("http request failed, statusCode='%d'", res.StatusCode)
	}

	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read body bytes: %w", err)
	}
	return bodyBytes, nil
}

// newDefaultLinkPreview returns a preview populated only with what can be
// derived from the URL itself: its string form and its hostname.
func newDefaultLinkPreview(url *neturl.URL) common.LinkPreview {
	var preview common.LinkPreview
	preview.URL = url.String()
	preview.Hostname = url.Hostname()
	return preview
}

// fetchThumbnail downloads the image at url (sending no extra headers) and
// returns its dimensions together with the image encoded as a data URI.
//
// On failure the returned thumbnail must not be relied upon: it is the zero
// value if the download or the dimension lookup failed, and carries only
// Width/Height if building the data URI failed.
func fetchThumbnail(logger *zap.Logger, httpClient http.Client, url string) (common.LinkPreviewThumbnail, error) {
	var thumbnail common.LinkPreviewThumbnail

	imgBytes, err := fetchBody(logger, httpClient, url, nil)
	if err != nil {
		return thumbnail, fmt.Errorf("could not fetch thumbnail: %w", err)
	}

	width, height, err := images.GetImageDimensions(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not get image dimensions: %w", err)
	}
	thumbnail.Width = width
	thumbnail.Height = height

	dataURI, err := images.GetPayloadDataURI(imgBytes)
	if err != nil {
		return thumbnail, fmt.Errorf("could not build data URI: %w", err)
	}
	thumbnail.DataURI = dataURI

	return thumbnail, nil
}

// OEmbedUnfurler builds a link preview by querying an oEmbed endpoint for the
// URL to be unfurled.
type OEmbedUnfurler struct {
	logger     *zap.Logger
	httpClient http.Client
	// oembedEndpoint describes where the consumer may request representations for
	// the supported URL scheme. For example, for YouTube, it is
	// https://www.youtube.com/oembed.
	oembedEndpoint string
	// url is the actual URL to be unfurled.
	url *neturl.URL
}

// OEmbedResponse holds the fields decoded from the provider's JSON oEmbed
// response.
type OEmbedResponse struct {
	Title        string `json:"title"`
	ThumbnailURL string `json:"thumbnail_url"`
}

// newOEmbedURL returns the oEmbed endpoint URL with its query string replaced
// by the "url" and "format" parameters. Any query string already present on
// oembedEndpoint is overwritten.
func (u OEmbedUnfurler) newOEmbedURL() (*neturl.URL, error) {
	oembedURL, err := neturl.Parse(u.oembedEndpoint)
	if err != nil {
		return nil, err
	}

	// When format is specified, the provider MUST return data in the requested
	// format, else return an error.
	oembedURL.RawQuery = neturl.Values{
		"url":    {u.url.String()},
		"format": {"json"},
	}.Encode()

	return oembedURL, nil
}

// unfurl requests the oEmbed JSON representation of u.url and copies its title
// into the preview.
//
// Whenever an error is returned, the accompanying preview is the default one
// (URL and hostname only). A response with an empty title is treated as an
// error. The decoded thumbnail URL is not used here.
func (u OEmbedUnfurler) unfurl() (common.LinkPreview, error) {
	preview := newDefaultLinkPreview(u.url)

	oembedURL, err := u.newOEmbedURL()
	if err != nil {
		return preview, err
	}

	headers := Headers{
		"accept":          headerAcceptJSON,
		"accept-language": headerAcceptLanguage,
		"user-agent":      headerUserAgent,
	}
	oembedBytes, err := fetchBody(u.logger, u.httpClient, oembedURL.String(), headers)
	if err != nil {
		return preview, err
	}

	var oembedResponse OEmbedResponse
	if err := json.Unmarshal(oembedBytes, &oembedResponse); err != nil {
		return preview, err
	}

	if oembedResponse.Title == "" {
		return preview, errors.New("missing required title in oEmbed response")
	}

	preview.Title = oembedResponse.Title
	return preview, nil
}

// OpenGraphMetadata holds the OpenGraph properties of interest. The `meta`
// struct tags name the og:* property each field corresponds to.
type OpenGraphMetadata struct {
	Title        string `json:"title" meta:"og:title"`
	Description  string `json:"description" meta:"og:description"`
	ThumbnailURL string `json:"thumbnailUrl" meta:"og:image"`
}

// OpenGraphUnfurler should be preferred over OEmbedUnfurler because oEmbed
// gives back a JSON response with a "html" field that's supposed to be embedded
// in an iframe (hardly useful for existing Status' clients).
type OpenGraphUnfurler struct { url *neturl.URL logger *zap.Logger httpClient http.Client } func (u OpenGraphUnfurler) unfurl() (common.LinkPreview, error) { preview := newDefaultLinkPreview(u.url) headers := map[string]string{ "accept": headerAcceptText, "accept-language": headerAcceptLanguage, "user-agent": headerUserAgent, } bodyBytes, err := fetchBody(u.logger, u.httpClient, u.url.String(), headers) if err != nil { return preview, err } var ogMetadata OpenGraphMetadata err = metabolize.Metabolize(ioutil.NopCloser(bytes.NewBuffer(bodyBytes)), &ogMetadata) if err != nil { return preview, fmt.Errorf("failed to parse OpenGraph data") } // There are URLs like https://wikipedia.org/ that don't have an OpenGraph // title tag, but article pages do. In the future, we can fallback to the // website's title by using the