package quant.rich.emoney.util; import org.jsoup.Jsoup; import org.jsoup.nodes.*; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; import java.util.Iterator; import java.util.Random; public class HtmlSanitizer { public static final Random random = new Random(); public static String sanitize(String unsafeHtml) { String nlRandom = new StringBuilder("__NL").append(random.nextInt()).append("__").toString(); unsafeHtml = unsafeHtml.replaceAll("\n", nlRandom); Document doc = Jsoup.parseBodyFragment(unsafeHtml); Element body = doc.body(); NodeTraversor.traverse(new NodeVisitor() { public void head(Node node, int depth) { if (!(node instanceof Element)) return; Element el = (Element) node; String tag = el.tagName().toLowerCase(); // 清洗 if (tag.equals("link")) { String href = el.attr("href"); if (isExternalUrl(href)) { replaceWithNotice(el, "link[href]"); return; } } // 清洗危险结构元素 switch (tag) { case "script": case "iframe": case "object": case "embed": case "video": case "audio": case "source": case "meta": case "fencedframe": replaceWithNotice(el, tag); return; } // 清除 on* 属性 Iterator it = el.attributes().iterator(); while (it.hasNext()) { Attribute attr = it.next(); if (attr.getKey().toLowerCase().startsWith("on")) { it.remove(); } } // 清除 src 外链 if (el.hasAttr("src")) { String src = el.attr("src"); if (isExternalUrl(src)) { el.removeAttr("src"); el.appendChild(new TextNode("[已移除外链资源]")); } } // 清除 style="...url(...)" 中的非 base64 URL if (el.hasAttr("style")) { String style = el.attr("style"); if (containsExternalUrl(style)) { style = style.replaceAll("(?i)url\\((?!['\"]?data:)[^)]*\\)", ""); el.attr("style", style + " /* 外链背景图已清除 */"); } } // a 标签添加 rel="noreferrer" if (tag.equals("a")) { String rel = el.attr("rel"); if (!rel.toLowerCase().contains("noreferrer")) { el.attr("rel", "noreferrer"); } } // style 标签内容清理 url(...) if (tag.equals("style")) { for (Node child : el.childNodes()) { if (child instanceof TextNode) { TextNode text = (TextNode) child; String content = text.getWholeText(); // 移除非 base64 的 url(...) String cleaned = content.replaceAll("(?i)url\\((?!['\"]?data:)[^)]*\\)", "/* 外链已清除 */"); text.text(cleaned); } } } } public void tail(Node node, int depth) { // nothing } }, body); // SVG、MathML 类危险标签 doc.select("svg, math, foreignObject").forEach(el -> replaceWithNotice(el, el.tagName()) ); unsafeHtml = body.html(); unsafeHtml = unsafeHtml.replaceAll(nlRandom, "\n"); return unsafeHtml; } private static boolean isExternalUrl(String url) { return url != null && !url.trim().toLowerCase().startsWith("data:") && (url.startsWith("http://") || url.startsWith("https://") || url.startsWith("//")); } private static boolean containsExternalUrl(String style) { return style != null && style.toLowerCase().matches(".*url\\((?!['\"]?data:).+\\).*"); } private static void replaceWithNotice(Element el, String tagName) { el.replaceWith(new Element("div") .addClass("blocked-element") .text("该 " + tagName + " 元素可能包含危害安全的内容,已屏蔽。")); } }