~netlandish/links-dev

links: Fix bug when invalid UTF8 characters are passed via external site html tags. v1 APPLIED

Peter Sanchez: 1
 Fix bug when invalid UTF8 characters are passed via external site html tags.

 1 file changed, 9 insertions(+), 20 deletions(-)
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.code.netlandish.com/~netlandish/links-dev/patches/176/mbox | git am -3
Learn more about email & git

[PATCH links] Fix bug when invalid UTF8 characters are passed via external site html tags. Export this patch

Changelog-fixed: Bug processing invalid UTF8 characters when parsing
  website metadata.
---
 helpers.go | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/helpers.go b/helpers.go
index 431d08a..e086eb2 100644
--- a/helpers.go
+++ b/helpers.go
@@ -326,33 +326,33 @@ func extract(resp io.Reader) *models.HTMLMeta {
 			if t.Data == "meta" {
 				desc, ok := extractMetaProperty(t, "description")
 				if ok {
-					hm.Description = core.StripHtmlTags(desc)
+					hm.Description = SanitizeUTF8(core.StripHtmlTags(desc))
 				}
 
 				ogTitle, ok := extractMetaProperty(t, "og:title")
 				if ok {
-					hm.Title = core.StripHtmlTags(ogTitle)
+					hm.Title = SanitizeUTF8(core.StripHtmlTags(ogTitle))
 				}
 
 				ogDesc, ok := extractMetaProperty(t, "og:description")
 				if ok {
-					hm.Description = core.StripHtmlTags(ogDesc)
+					hm.Description = SanitizeUTF8(core.StripHtmlTags(ogDesc))
 				}
 
 				ogImage, ok := extractMetaProperty(t, "og:image")
 				if ok {
-					hm.Image = core.StripHtmlTags(ogImage)
+					hm.Image = SanitizeUTF8(core.StripHtmlTags(ogImage))
 				}
 
 				ogSiteName, ok := extractMetaProperty(t, "og:site_name")
 				if ok {
-					hm.SiteName = core.StripHtmlTags(ogSiteName)
+					hm.SiteName = SanitizeUTF8(core.StripHtmlTags(ogSiteName))
 				}
 			}
 		case html.TextToken:
 			if titleFound {
 				t := z.Token()
-				hm.Title = core.StripHtmlTags(t.Data)
+				hm.Title = SanitizeUTF8(core.StripHtmlTags(t.Data))
 				titleFound = false
 			}
 		}
@@ -1212,24 +1212,13 @@ func IPForContext(ctx context.Context) string {
 	return ip
 }
 
-// SanitizeUTF8 will strip out invalid utf-8 characters
+// SanitizeUTF8 will strip out invalid utf-8 characters.
+// Invalid UTF-8 bytes are replaced with the Unicode replacement character.
 func SanitizeUTF8(input string) string {
 	if utf8.ValidString(input) {
 		return input
 	}
-
-	var b strings.Builder
-	for i := 0; i < len(input); {
-		r, size := utf8.DecodeRuneInString(input[i:])
-		if r == utf8.RuneError && size == 1 {
-			// Replace invalid bytes with a space
-			b.WriteString(" ")
-		} else {
-			b.WriteRune(r)
-		}
-		i += size
-	}
-	return b.String()
+	return strings.ToValidUTF8(input, "�")
 }
 
 // StripURLFragment will simply return a URL without any fragment options
-- 
2.49.1
Applied.

To git@git.code.netlandish.com:~netlandish/links
   47cf073..53069b2  master -> master