package urls import ( "strings" "unicode/utf8" ) const ( htmlTagStart = 60 // Unicode `<` htmlTagEnd = 62 // Unicode `>` ) // Taken from https://stackoverflow.com/a/64701836 // Aggressively strips HTML tags from a string. // It will only keep anything between `>` and `<`. func stripHTMLTags(s string) string { // Setup a string builder and allocate enough memory for the new string. var builder strings.Builder builder.Grow(len(s) + utf8.UTFMax) in := false // True if we are inside an HTML tag. start := 0 // The index of the previous start tag character `<` end := 0 // The index of the previous end tag character `>` for i, c := range s { // If this is the last character and we are not in an HTML tag, save it. if (i+1) == len(s) && end >= start { builder.WriteString(s[end:]) } // Keep going if the character is not `<` or `>` if c != htmlTagStart && c != htmlTagEnd { continue } if c == htmlTagStart { // Only update the start if we are not in a tag. // This make sure we strip out `<
` not just `
` if !in { start = i } in = true // Write the valid string between the close and start of the two tags. builder.WriteString(s[end:start]) continue } // else c == htmlTagEnd in = false end = i + 1 } s = builder.String() return s }