[PATCH links] Fix bug when invalid UTF-8 characters are passed via external site HTML tags.
Export this patch
Changelog-fixed: Bug processing invalid UTF-8 characters when parsing
website metadata.
---
helpers.go | 29 +++++++++--------------------
1 file changed, 9 insertions(+), 20 deletions(-)
diff --git a/helpers.go b/helpers.go
index 431d08a..e086eb2 100644
--- a/helpers.go
+++ b/helpers.go
@@ -326,33 +326,33 @@ func extract(resp io.Reader) *models.HTMLMeta {
if t.Data == "meta" {
desc, ok := extractMetaProperty(t, "description")
if ok {
- hm.Description = core.StripHtmlTags(desc)
+ hm.Description = SanitizeUTF8(core.StripHtmlTags(desc))
}
ogTitle, ok := extractMetaProperty(t, "og:title")
if ok {
- hm.Title = core.StripHtmlTags(ogTitle)
+ hm.Title = SanitizeUTF8(core.StripHtmlTags(ogTitle))
}
ogDesc, ok := extractMetaProperty(t, "og:description")
if ok {
- hm.Description = core.StripHtmlTags(ogDesc)
+ hm.Description = SanitizeUTF8(core.StripHtmlTags(ogDesc))
}
ogImage, ok := extractMetaProperty(t, "og:image")
if ok {
- hm.Image = core.StripHtmlTags(ogImage)
+ hm.Image = SanitizeUTF8(core.StripHtmlTags(ogImage))
}
ogSiteName, ok := extractMetaProperty(t, "og:site_name")
if ok {
- hm.SiteName = core.StripHtmlTags(ogSiteName)
+ hm.SiteName = SanitizeUTF8(core.StripHtmlTags(ogSiteName))
}
}
case html.TextToken:
if titleFound {
t := z.Token()
- hm.Title = core.StripHtmlTags(t.Data)
+ hm.Title = SanitizeUTF8(core.StripHtmlTags(t.Data))
titleFound = false
}
}
@@ -1212,24 +1212,13 @@ func IPForContext(ctx context.Context) string {
return ip
}
-// SanitizeUTF8 will strip out invalid utf-8 characters
+// SanitizeUTF8 returns input unchanged when it is already valid UTF-8.
+// Otherwise each run of invalid bytes is replaced with U+FFFD, the Unicode replacement character.
func SanitizeUTF8(input string) string {
if utf8.ValidString(input) {
return input
}
-
- var b strings.Builder
- for i := 0; i < len(input); {
- r, size := utf8.DecodeRuneInString(input[i:])
- if r == utf8.RuneError && size == 1 {
- // Replace invalid bytes with a space
- b.WriteString(" ")
- } else {
- b.WriteRune(r)
- }
- i += size
- }
- return b.String()
+ return strings.ToValidUTF8(input, "�")
}
// StripURLFragment will simply return a URL without any fragment options
--
2.49.1
Applied.
To git@git.code.netlandish.com:~netlandish/links
47cf073..53069b2 master -> master