summaryrefslogtreecommitdiff
path: root/shared/components/src/utils/sanitize-html/common.ts
diff options
context:
space:
mode:
authorrxliuli <rxliuli@gmail.com>2025-11-04 05:03:50 +0800
committerrxliuli <rxliuli@gmail.com>2025-11-04 05:03:50 +0800
commitbce557cc2dc767628bed6aac87301a1be7c5431b (patch)
treeb51a051228d01fe3306cd7626d4a96768aadb944 /shared/components/src/utils/sanitize-html/common.ts
init commit
Diffstat (limited to 'shared/components/src/utils/sanitize-html/common.ts')
-rw-r--r--shared/components/src/utils/sanitize-html/common.ts176
1 files changed, 176 insertions, 0 deletions
diff --git a/shared/components/src/utils/sanitize-html/common.ts b/shared/components/src/utils/sanitize-html/common.ts
new file mode 100644
index 0000000..38b3b2e
--- /dev/null
+++ b/shared/components/src/utils/sanitize-html/common.ts
@@ -0,0 +1,176 @@
+/**
+ * Tag names that survive sanitization. Lookups are done with the
+ * lowercased tag name of each node.
+ */
+type AllowedTags = Set<string>;
+
+/** For each tag name, the set of attribute names that may be kept on it. */
+type AllowedAttributes = Record<string, Set<string>>;
+
+/** Options accepted by the HTML sanitizer. */
+export interface SanitizeHtmlOptions {
+  /**
+   * Complete list of tags to keep. When given, it is used instead of
+   * DEFAULT_SAFE_TAGS. Cannot be combined with `extraAllowedTags`.
+   */
+  allowedTags?: string[];
+
+  /**
+   * Tags to keep in addition to DEFAULT_SAFE_TAGS. Cannot be combined
+   * with `allowedTags`.
+   */
+  extraAllowedTags?: string[];
+
+  /**
+   * When true, the children of a disallowed tag are still sanitized and
+   * kept in place of the removed tag. Otherwise the tag is dropped together
+   * with everything inside it.
+   */
+  keepChildrenWhenRemovingParent?: boolean;
+
+  /**
+   * When true, every `&nbsp;` entity in the serialized output is replaced
+   * with a regular space.
+   */
+  removeNbsp?: boolean;
+
+  /**
+   * Maps a tag name to the list of attribute names allowed on that tag:
+   *
+   *   { 'p': ['class'], 'div': ['role', 'aria-hidden'] }
+   *
+   * The example keeps ONLY `class` on <p> and ONLY `role` and `aria-hidden`
+   * on <div>. Attributes of tags that have no entry are all removed.
+   */
+  allowedAttributes?: Record<string, string[]>;
+}
+
+/** Tags kept when the caller does not supply its own `allowedTags` list. */
+export const DEFAULT_SAFE_TAGS: string[] = ['strong', 'em', 'b', 'i', 'u', 'br'];
+
+/** Default attribute allow-list: empty, so no attribute is kept on any tag. */
+const DEFAULT_SAFE_ATTRS = {};
+
+/**
+ * Sanitizes the child nodes of `unsafeNode`, removing every tag and
+ * attribute that is not explicitly allowed, and returns the result
+ * serialized as an HTML string.
+ *
+ * @param unsafeDocument - Document used to create the scratch `<div>` that
+ *   collects the sanitized nodes before serialization.
+ * @param unsafeNode - Node or fragment whose child nodes are sanitized.
+ * @param options - Allow-lists and flags; see `SanitizeHtmlOptions`. Tag
+ *   names are matched against the lowercased tag name of each element.
+ * @returns The `innerHTML` of the container holding the sanitized nodes.
+ * @throws Error when both `allowedTags` and `extraAllowedTags` are passed.
+ */
+export function sanitizeDocument(
+  unsafeDocument: Document,
+  unsafeNode: Node | DocumentFragment,
+  {
+    allowedTags,
+    extraAllowedTags,
+    allowedAttributes = DEFAULT_SAFE_ATTRS,
+    keepChildrenWhenRemovingParent = false,
+    removeNbsp = false,
+  }: SanitizeHtmlOptions = {},
+): string {
+  if (allowedTags && extraAllowedTags) {
+    throw new Error(
+      'sanitizeHtml got both allowedTags and extraAllowedTags',
+    );
+  }
+
+  // Either the caller's full list, or the defaults plus the caller's extras.
+  const allowedTagsSet: AllowedTags = new Set([
+    ...(extraAllowedTags || []),
+    ...(allowedTags || DEFAULT_SAFE_TAGS),
+  ]);
+
+  // Prototype-less lookup table: keys such as "__proto__" or "constructor"
+  // become ordinary entries instead of touching Object.prototype.
+  const allowedAttributeSets: AllowedAttributes = Object.create(null);
+  for (const [tag, attributes] of Object.entries(allowedAttributes)) {
+    allowedAttributeSets[tag] = new Set(attributes);
+  }
+
+  const sanitizedContainer = unsafeDocument.createElement('div');
+
+  // Iterate over a snapshot so the live NodeList cannot shift underneath us.
+  for (const child of [...unsafeNode.childNodes]) {
+    const sanitizedChildren = sanitizeNode(
+      child as Element,
+      allowedTagsSet,
+      allowedAttributeSets,
+      keepChildrenWhenRemovingParent,
+    );
+    for (const sanitizedChild of sanitizedChildren) {
+      sanitizedContainer.appendChild(sanitizedChild);
+    }
+  }
+
+  const html = sanitizedContainer.innerHTML;
+
+  // Replace &nbsp; with regular spaces if removeNbsp option is enabled
+  return removeNbsp ? html.replace(/&nbsp;/g, ' ') : html;
+}
+
+/**
+ * Sanitizes a single node and returns the nodes that should replace it.
+ *
+ * - Text and CDATA nodes are returned as a copy.
+ * - A disallowed node yields its sanitized children when
+ *   `keepChildrenWhenRemovingParent` is true, otherwise nothing.
+ * - An allowed element is rebuilt from scratch with only its allowed
+ *   attributes and its sanitized children.
+ *
+ * @param node - Node to sanitize (may be a non-element node at runtime).
+ * @param allowedTags - Lowercase tag names that may be kept.
+ * @param allowedAttributes - Per-tag sets of attribute names that may be kept.
+ * @param keepChildrenWhenRemovingParent - Whether to keep the sanitized
+ *   children of a removed node.
+ * @returns Zero or more newly created nodes.
+ */
+function sanitizeNode(
+  node: Element,
+  allowedTags: AllowedTags,
+  allowedAttributes: AllowedAttributes,
+  keepChildrenWhenRemovingParent = false,
+): Node[] {
+  // Plain text is safe as is
+  // NOTE: The lowercase node (instead of Node) is intentional. Node is only
+  //       accessible in browser. In Node.js, it depends on jsdom (which we
+  //       avoid importing to exclude from the clientside vendor bundle).
+  //       Instead of passing down window.Node or jsdom.Node depending on
+  //       context, we rely on the fact that instances of Node (of which node
+  //       will be one) will also have these constants set on them.
+  if (
+    node.nodeType === node.TEXT_NODE ||
+    node.nodeType === node.CDATA_SECTION_NODE
+  ) {
+    // Return a copy: appending the original would move it out of the
+    // caller's tree and silently mutate the input.
+    return [node.cloneNode(false)];
+  }
+
+  // Refuse anything that isn't a tag or one of the allowed tags
+  const tagName = (node.tagName || '').toLowerCase();
+
+  if (!allowedTags.has(tagName)) {
+    // when keepChildrenWhenRemovingParent is true
+    // we check children for valid nodes as well
+    return keepChildrenWhenRemovingParent
+      ? sanitizeChildren(
+          node,
+          allowedTags,
+          allowedAttributes,
+          keepChildrenWhenRemovingParent,
+        )
+      : [];
+  }
+
+  // Reconstruct node with only the allowedAttributes and sanitize its children
+  const sanitized = node.ownerDocument.createElement(tagName);
+
+  // Own-property check so a tag named like an Object.prototype member
+  // (e.g. "constructor") cannot resolve to an inherited non-Set value.
+  const permittedAttributes = Object.prototype.hasOwnProperty.call(
+    allowedAttributes,
+    tagName,
+  )
+    ? allowedAttributes[tagName]
+    : undefined;
+
+  if (permittedAttributes) {
+    // NOTE(review): values are copied verbatim. Allowing attributes such as
+    // href, src, style or on* means the caller must vet their values.
+    for (const { name, value } of [...node.attributes]) {
+      if (permittedAttributes.has(name)) {
+        sanitized.setAttribute(name, value);
+      }
+    }
+  }
+
+  const children = sanitizeChildren(
+    node,
+    allowedTags,
+    allowedAttributes,
+    keepChildrenWhenRemovingParent,
+  );
+
+  for (const child of children) {
+    sanitized.appendChild(child);
+  }
+
+  return [sanitized];
+}
+
+/**
+ * Sanitizes every child node of `node` and returns the results as one
+ * flat list, in document order.
+ *
+ * @param node - Parent whose child nodes are sanitized.
+ * @param allowedTags - Lowercase tag names that may be kept.
+ * @param allowedAttributes - Per-tag sets of attribute names that may be kept.
+ * @param keepChildrenWhenRemovingParent - Forwarded unchanged to
+ *   `sanitizeNode` for each child.
+ * @returns The sanitized replacement nodes of all children.
+ */
+const sanitizeChildren = (
+  node: Element,
+  allowedTags: AllowedTags,
+  allowedAttributes: AllowedAttributes,
+  keepChildrenWhenRemovingParent: boolean,
+): Node[] =>
+  // Spread first: a snapshot is immune to changes in the live NodeList.
+  [...node.childNodes].flatMap((childNode) =>
+    sanitizeNode(
+      childNode as Element,
+      allowedTags,
+      allowedAttributes,
+      keepChildrenWhenRemovingParent,
+    ),
+  );