Received: from mail.netlandish.com (mail.netlandish.com [174.136.98.166]) by code.netlandish.com (Postfix) with ESMTP id 92C2E11BE for <~netlandish/links-dev@lists.code.netlandish.com>; Fri, 22 Aug 2025 18:30:34 +0000 (UTC) Received-SPF: Pass (mailfrom) identity=mailfrom; client-ip=209.85.217.48; helo=mail-vs1-f48.google.com; envelope-from=peter@netlandish.com; receiver= Authentication-Results: mail.netlandish.com; dkim=pass (1024-bit key; unprotected) header.d=netlandish.com header.i=@netlandish.com header.b=gRz+bzcv Received: from mail-vs1-f48.google.com (mail-vs1-f48.google.com [209.85.217.48]) by mail.netlandish.com (Postfix) with ESMTP id E64921D67F2 for <~netlandish/links-dev@lists.code.netlandish.com>; Fri, 22 Aug 2025 18:31:28 +0000 (UTC) Received: by mail-vs1-f48.google.com with SMTP id ada2fe7eead31-50f890e9054so1603827137.1 for <~netlandish/links-dev@lists.code.netlandish.com>; Fri, 22 Aug 2025 11:31:28 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=netlandish.com; s=google; t=1755887488; x=1756492288; darn=lists.code.netlandish.com; h=content-transfer-encoding:mime-version:message-id:date:subject:cc :to:from:from:to:cc:subject:date:message-id:reply-to; bh=2DTgSP1d+H8DHCksSRxk3Od5NGGHPZdY41q566hMWfw=; b=gRz+bzcvzaSfYm7zKacY0Pd5OOIzLPgD91Dt771BN6g+KIXiB9+LMuBesmzQ6DsO99 r+zek24nFxICvd7MfrB8hFFUx5q8Q+RTkcW723S7Cilkxv2wCsxO3arD0W38cIjzAKQh 0hQyWTJ+Rdb4j/zB/F388pQjVfnFGxKWfAGPI= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1755887488; x=1756492288; h=content-transfer-encoding:mime-version:message-id:date:subject:cc :to:from:x-gm-message-state:from:to:cc:subject:date:message-id :reply-to; bh=2DTgSP1d+H8DHCksSRxk3Od5NGGHPZdY41q566hMWfw=; b=FaytfNbBQl+gjaSvyD9N9DtzSCyU6tay/y92HPN1EjJlKgTytT9uCeosN7cTdUFwgW jWOb6QBWiRzq5cSF4xUG78BBZ1rBbuYRbXTC5hFKr2DJzMhPBoc3zThbWwdX7fz6Q7TI vKf5tnnQb5X/sH3XVgbdOmqbZikc05JIieYhQq0MN+SJ5h9QiofMMK9xv6vpdAKaKcB3 
MLsPz7s45H4bkxYCHSpDkxyI9AOba3KblvKcZSGb2dW0+Kf1exOmHxJNfi9BAMBXJBgg BqLQnIq8raGa2L/6y/EdjLZP65vPlCbEEirlCS/jyd9Rd8ickKjLS/VxOsnbG3nyb+k6 7pYA== X-Gm-Message-State: AOJu0YzIBfRRc2PpJIut6/ckEb37tC4+e2NpMBiJEXuveCPgYsOxiVJm 08XHAXTyQADss/qlncCuHJQgYSihwTVzI4QxYfmcKpUle8b7Y8EttYX9eOqeWJeyraJLAMQJlbe vLZM6sV17uw== X-Gm-Gg: ASbGncu9YLpjy/cRJzKEzvIQiVVpBH5p/zhfrBRdlfnk7TOIn8wn0qgeQ1Mdo9n2UHI +Gwgeiq1c5ebUwAtUBwGsrN7QJr+AQ7JsbfbI94yOnJlr0xGXB6ECpp5d2rjAHbkWCeZOAg6e+h Jgd5zftABx92bJ9nZO5wqWdlWcMc44prRF1txW7i4lBlNH/hafyUBDRwqLUqcczoGyI4f7WdV0t COkIt97c3w227I2gxKfXXVtiB5GU4PfOy2HdSxBCHImpgVoq7afM5+XDjOSzH+0UFuLaOSRNVLp 8AYi+Cl11Wt5oWoPYStDU/BX1YQUz3D+l3wITurN7qw+xlklJmMzdGgLglxu2j8JMzXSXdeOEwU h9DtTPqoEVib2Zv0V2Y6GSTQ= X-Google-Smtp-Source: AGHT+IEjamqjXgekUYZoybn7fCxfz6SYbuVRcliPNsgkw8tj0X7U93z9oJpLTjUz6pfhXjinlQKpXA== X-Received: by 2002:a05:6102:b11:b0:511:db31:acc2 with SMTP id ada2fe7eead31-51d0c8c44dcmr1235014137.4.1755887487763; Fri, 22 Aug 2025 11:31:27 -0700 (PDT) Received: from localhost ([2803:2d60:1118:5ee:76f5:937d:79dd:b762]) by smtp.gmail.com with UTF8SMTPSA id a1e0cc1a2514c-892374d079bsm58998241.15.2025.08.22.11.31.27 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 22 Aug 2025 11:31:27 -0700 (PDT) From: Peter Sanchez To: ~netlandish/links-dev@lists.code.netlandish.com Cc: Peter Sanchez Subject: [PATCH links] Fix bug when invalid UTF8 characters are passed via external site html tags. Date: Fri, 22 Aug 2025 12:31:21 -0600 Message-ID: <20250822183125.17329-1-peter@netlandish.com> X-Mailer: git-send-email 2.49.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changelog-fixed: Bug processing invalid UTF8 characters when parsing website metadata. 
--- helpers.go | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/helpers.go b/helpers.go index 431d08a..e086eb2 100644 --- a/helpers.go +++ b/helpers.go @@ -326,33 +326,33 @@ func extract(resp io.Reader) *models.HTMLMeta { if t.Data == "meta" { desc, ok := extractMetaProperty(t, "description") if ok { - hm.Description = core.StripHtmlTags(desc) + hm.Description = SanitizeUTF8(core.StripHtmlTags(desc)) } ogTitle, ok := extractMetaProperty(t, "og:title") if ok { - hm.Title = core.StripHtmlTags(ogTitle) + hm.Title = SanitizeUTF8(core.StripHtmlTags(ogTitle)) } ogDesc, ok := extractMetaProperty(t, "og:description") if ok { - hm.Description = core.StripHtmlTags(ogDesc) + hm.Description = SanitizeUTF8(core.StripHtmlTags(ogDesc)) } ogImage, ok := extractMetaProperty(t, "og:image") if ok { - hm.Image = core.StripHtmlTags(ogImage) + hm.Image = SanitizeUTF8(core.StripHtmlTags(ogImage)) } ogSiteName, ok := extractMetaProperty(t, "og:site_name") if ok { - hm.SiteName = core.StripHtmlTags(ogSiteName) + hm.SiteName = SanitizeUTF8(core.StripHtmlTags(ogSiteName)) } } case html.TextToken: if titleFound { t := z.Token() - hm.Title = core.StripHtmlTags(t.Data) + hm.Title = SanitizeUTF8(core.StripHtmlTags(t.Data)) titleFound = false } } @@ -1212,24 +1212,13 @@ func IPForContext(ctx context.Context) string { return ip } -// SanitizeUTF8 will strip out invalid utf-8 characters +// SanitizeUTF8 returns input as valid UTF-8. +// Each run of invalid UTF-8 bytes is replaced with the Unicode replacement character (U+FFFD). 
func SanitizeUTF8(input string) string { if utf8.ValidString(input) { return input } - - var b strings.Builder - for i := 0; i < len(input); { - r, size := utf8.DecodeRuneInString(input[i:]) - if r == utf8.RuneError && size == 1 { - // Replace invalid bytes with a space - b.WriteString(" ") - } else { - b.WriteRune(r) - } - i += size - } - return b.String() + return strings.ToValidUTF8(input, "�") } // StripURLFragment will simply return a URL without any fragment options -- 2.49.1