Received: from mail.netlandish.com (mail.netlandish.com [174.136.98.166]) by code.netlandish.com (Postfix) with ESMTP id 68370141 for <~netlandish/links-dev@lists.code.netlandish.com>; Thu, 27 Feb 2025 00:55:57 +0000 (UTC) Received-SPF: Pass (mailfrom) identity=mailfrom; client-ip=209.85.128.178; helo=mail-yw1-f178.google.com; envelope-from=peter@netlandish.com; receiver= Authentication-Results: mail.netlandish.com; dkim=pass (1024-bit key; unprotected) header.d=netlandish.com header.i=@netlandish.com header.b=hjbhXxuu Received: from mail-yw1-f178.google.com (mail-yw1-f178.google.com [209.85.128.178]) by mail.netlandish.com (Postfix) with ESMTP id C94781D80C1 for <~netlandish/links-dev@lists.code.netlandish.com>; Thu, 27 Feb 2025 01:04:12 +0000 (UTC) Received: by mail-yw1-f178.google.com with SMTP id 00721157ae682-6fcf90d09c6so4371437b3.0 for <~netlandish/links-dev@lists.code.netlandish.com>; Wed, 26 Feb 2025 17:04:12 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=netlandish.com; s=google; t=1740618252; x=1741223052; darn=lists.code.netlandish.com; h=content-transfer-encoding:mime-version:message-id:date:subject:cc :to:from:from:to:cc:subject:date:message-id:reply-to; bh=Vv5DflX1gHQpm5Db5kYxMhDig4kgpACujPL4wdFC8Ic=; b=hjbhXxuuxmiCrheeBHfY2/1AfFsyj8rmllUvNxlWs+5axMTZUPcEcKb6+OgncNJQR2 Zno2PVVYzL2Jix3Bw3/w5FdSlGyuPckPnbmvuZYeFIKuz7DWH1ygRXfqTGDn30L/ilYu /vJlCTunoRM08vi+ULVWSpYhTcCMUXYY8LpZI= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1740618252; x=1741223052; h=content-transfer-encoding:mime-version:message-id:date:subject:cc :to:from:x-gm-message-state:from:to:cc:subject:date:message-id :reply-to; bh=Vv5DflX1gHQpm5Db5kYxMhDig4kgpACujPL4wdFC8Ic=; b=VYkHpvok0j6OjQA5RS3PtTOxauhEwyVoLgf8TOy0kv1TD0Z3d+WVY+D1orgtHInMCF RQm4uebCI09m6Fp0AqVYaXvDB5JR8+RZoxz/ylEKjDOQNDvo7ymKvk2FAUK0b0uzZd5s HHl31OvVfITOG3yWcLY3+VrLu3HIB/3x8cTVkT3TPi7WHjHcFNBd1RxdI6Sy26HiOaot uinYBEH5dt8rjjPK/CqD5JnCkw1mV2CR33OVXnUar5HQWnRG+hwKwYK0BtSmB9RQwc5l hcMp17mG/Fpl+9aNC3sKGkmZrnMf1eT/sC22NoTiGypD6rB76n4HLoFidn3VCD7q8J83 xj1g== X-Gm-Message-State: AOJu0Yy4JfM7OXFcEd529M98c5qziqYmuBhWEosfAnK15ivz53X2ZuxN j+/XesJocMYFE/cykw3NDqwBd64oLdAXoe2ue7/OToTCPoav7Mz/w+tdOirCxEDox5pYomTHwB2 gbDg= X-Gm-Gg: ASbGncvrMfieoGBzHYaAy2WcVH0UjH+x3g7/UXyC4f7A28nClxPLWs6Tyxhnk5ChOH7 S6OhGVmXk893G0kTlBHCp2IUe1lOs39PcV+6JepQeLbJS4UCH2wpTWTyLR5zetwH8VAUOUP5WRH 2/QW0QcSnZoEfNbzMnosuD/vkjhhl+fZwukLAoGkZT2OdflkRlbfsbyB6YWYzusfO0yigLpKK5U wV92sxeKlAgzNwR2TxTJsWdJKte3xZy7b/Pe26EtVRoloGvBd2w5BQyZYoAt7+p47zVfnb98r8F BdxwcqgJimqpRAEfkYheK20Ih3I= X-Google-Smtp-Source: AGHT+IF14x8L1UE4HmNbBvUA06ldTAy1tBNf6boiungoyLZuJdZpF0sed3QovsOK79wGWd4ZhBjrFg== X-Received: by 2002:a05:690c:8c17:b0:6f9:e4e1:a86 with SMTP id 00721157ae682-6fd395f0e8cmr15241037b3.16.1740618251937; Wed, 26 Feb 2025 17:04:11 -0800 (PST) Received: from localhost ([2803:2d60:1118:5ee:51ec:9bc1:b292:c8d9]) by smtp.gmail.com with ESMTPSA id 00721157ae682-6fd3cb7e1f9sm583197b3.88.2025.02.26.17.04.11 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 26 Feb 2025 17:04:11 -0800 (PST) From: Peter Sanchez To: ~netlandish/links-dev@lists.code.netlandish.com Cc: Peter Sanchez Subject: [PATCH links] Add utf8 sanitizer to address edge case import errors. Date: Wed, 26 Feb 2025 19:03:49 -0600 Message-ID: <20250227010406.32668-1-peter@netlandish.com> X-Mailer: git-send-email 2.47.2 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Fixes: https://todo.code.netlandish.com/~netlandish/links/96 Signed-off-by: Peter Sanchez --- core/import.go | 5 +++-- helpers.go | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/core/import.go b/core/import.go index 1671f47..ae8e14e 100644 --- a/core/import.go +++ b/core/import.go @@ -114,6 +114,7 @@ func (p pinBoardObj) GetTags() []string { func trimTags(tags []string) []string { var ret []string for _, t := range tags { + t = links.SanitizeUTF8(t) if len(t) > 50 { t = t[:50] } @@ -300,9 +301,9 @@ func processOrgLinks(obj importObj, baseURLMap map[string]int, title = title[:146] + "..." } return &models.OrgLink{ - Title: title, + Title: links.SanitizeUTF8(title), URL: obj.GetURL(), - Description: obj.GetDescription(), + Description: links.SanitizeUTF8(obj.GetDescription()), BaseURLID: sql.NullInt64{Valid: true, Int64: int64(baseID)}, OrgID: org.ID, UserID: int(user.ID), diff --git a/helpers.go b/helpers.go index 36a6eb7..125698a 100644 --- a/helpers.go +++ b/helpers.go @@ -23,6 +23,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" "git.sr.ht/~emersion/gqlclient" "github.com/99designs/gqlgen/graphql" @@ -1159,3 +1160,23 @@ func IPForContext(ctx context.Context) string { } return ip } + +// SanitizeUTF8 will strip out invalid utf-8 characters +func SanitizeUTF8(input string) string { + if utf8.ValidString(input) { + return input + } + + var b strings.Builder + for i := 0; i < len(input); { + r, size := utf8.DecodeRuneInString(input[i:]) + if r == utf8.RuneError && size == 1 { + // Replace invalid bytes with a space + b.WriteString(" ") + } else { + b.WriteRune(r) + } + i += size + } + return b.String() +} -- 2.47.2