// File-viewer metadata: 135 lines, 5.0 KiB, Java
package quant.rich.emoney.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import java.util.Iterator;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Pattern;

public class HtmlSanitizer {
|
|
|
|
public static final Random random = new Random();
|
|
|
|
public static String sanitize(String unsafeHtml) {
|
|
String nlRandom =
|
|
new StringBuilder("__NL").append(random.nextInt()).append("__").toString();
|
|
unsafeHtml = unsafeHtml.replaceAll("\n", nlRandom);
|
|
Document doc = Jsoup.parseBodyFragment(unsafeHtml);
|
|
Element body = doc.body();
|
|
|
|
NodeTraversor.traverse(new NodeVisitor() {
|
|
public void head(Node node, int depth) {
|
|
if (!(node instanceof Element)) return;
|
|
Element el = (Element) node;
|
|
String tag = el.tagName().toLowerCase();
|
|
|
|
|
|
|
|
// 清洗 <link rel="stylesheet" href="...">
|
|
if (tag.equals("link")) {
|
|
String href = el.attr("href");
|
|
if (isExternalUrl(href)) {
|
|
replaceWithNotice(el, "link[href]");
|
|
return;
|
|
}
|
|
}
|
|
|
|
// 清洗危险结构元素
|
|
switch (tag) {
|
|
case "script":
|
|
case "iframe":
|
|
case "object":
|
|
case "embed":
|
|
case "video":
|
|
case "audio":
|
|
case "source":
|
|
case "meta":
|
|
case "fencedframe":
|
|
replaceWithNotice(el, tag);
|
|
return;
|
|
}
|
|
|
|
// 清除 on* 属性
|
|
Iterator<Attribute> it = el.attributes().iterator();
|
|
while (it.hasNext()) {
|
|
Attribute attr = it.next();
|
|
if (attr.getKey().toLowerCase().startsWith("on")) {
|
|
it.remove();
|
|
}
|
|
}
|
|
|
|
// 清除 src 外链
|
|
if (el.hasAttr("src")) {
|
|
String src = el.attr("src");
|
|
if (isExternalUrl(src)) {
|
|
el.removeAttr("src");
|
|
el.appendChild(new TextNode("[已移除外链资源]"));
|
|
}
|
|
}
|
|
|
|
// 清除 style="...url(...)" 中的非 base64 URL
|
|
if (el.hasAttr("style")) {
|
|
String style = el.attr("style");
|
|
if (containsExternalUrl(style)) {
|
|
style = style.replaceAll("(?i)url\\((?!['\"]?data:)[^)]*\\)", "");
|
|
el.attr("style", style + " /* 外链背景图已清除 */");
|
|
}
|
|
}
|
|
|
|
// a 标签添加 rel="noreferrer"
|
|
if (tag.equals("a")) {
|
|
String rel = el.attr("rel");
|
|
if (!rel.toLowerCase().contains("noreferrer")) {
|
|
el.attr("rel", "noreferrer");
|
|
}
|
|
}
|
|
|
|
// style 标签内容清理 url(...)
|
|
if (tag.equals("style")) {
|
|
for (Node child : el.childNodes()) {
|
|
if (child instanceof TextNode) {
|
|
TextNode text = (TextNode) child;
|
|
String content = text.getWholeText();
|
|
// 移除非 base64 的 url(...)
|
|
String cleaned = content.replaceAll("(?i)url\\((?!['\"]?data:)[^)]*\\)", "/* 外链已清除 */");
|
|
text.text(cleaned);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public void tail(Node node, int depth) {
|
|
// nothing
|
|
}
|
|
}, body);
|
|
|
|
// SVG、MathML 类危险标签
|
|
doc.select("svg, math, foreignObject").forEach(el ->
|
|
replaceWithNotice(el, el.tagName())
|
|
);
|
|
|
|
unsafeHtml = body.html();
|
|
unsafeHtml = unsafeHtml.replaceAll(nlRandom, "\n");
|
|
|
|
return unsafeHtml;
|
|
}
|
|
|
|
private static boolean isExternalUrl(String url) {
|
|
return url != null
|
|
&& !url.trim().toLowerCase().startsWith("data:")
|
|
&& (url.startsWith("http://") || url.startsWith("https://") || url.startsWith("//"));
|
|
}
|
|
|
|
private static boolean containsExternalUrl(String style) {
|
|
return style != null && style.toLowerCase().matches(".*url\\((?!['\"]?data:).+\\).*");
|
|
}
|
|
|
|
private static void replaceWithNotice(Element el, String tagName) {
|
|
el.replaceWith(new Element("div")
|
|
.addClass("blocked-element")
|
|
.text("该 " + tagName + " 元素可能包含危害安全的内容,已屏蔽。"));
|
|
}
|
|
}
|