获取指标说明的能力

This commit is contained in:
2025-05-21 15:51:44 +08:00
parent 06c351a956
commit 1f329e3b2a
131 changed files with 4461 additions and 3126 deletions

View File

@@ -0,0 +1,134 @@
package quant.rich.emoney.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import java.util.Iterator;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Pattern;
/**
 * Best-effort sanitizer for untrusted HTML fragments, built on jsoup.
 *
 * <p>The sanitizer parses the fragment, then for every element:
 * <ul>
 *   <li>replaces active/embedding elements (script, iframe, object, ...) and
 *       externally linked {@code <link>} elements with a visible notice;</li>
 *   <li>removes {@code on*} event-handler attributes;</li>
 *   <li>removes URL attributes that use a script scheme ({@code javascript:}, {@code vbscript:});</li>
 *   <li>removes {@code src} attributes that point to external hosts;</li>
 *   <li>strips non-{@code data:} {@code url(...)} references from inline styles and
 *       {@code <style>} elements;</li>
 *   <li>ensures anchors carry {@code rel="noreferrer"}.</li>
 * </ul>
 * Finally, SVG / MathML subtrees are replaced with a notice.
 *
 * <p>NOTE(review): this is a block-list sanitizer. CSS constructs that load resources
 * without {@code url(...)} (for example {@code @import "..."}) are not covered; if strict
 * guarantees are required, consider an allow-list approach.
 */
public class HtmlSanitizer {

    /** Source of the per-call newline placeholder; kept public for backward compatibility. */
    public static final Random random = new Random();

    /** Matches a CSS {@code url(...)} whose target is not a {@code data:} URI. */
    private static final Pattern EXTERNAL_CSS_URL =
            Pattern.compile("url\\((?!\\s*['\"]?\\s*data:)[^)]*\\)", Pattern.CASE_INSENSITIVE);

    /** Attributes whose value is interpreted as a URL and may therefore carry a script scheme. */
    private static final String[] URL_ATTRIBUTES = {"href", "src", "action", "formaction", "xlink:href"};

    /**
     * Sanitizes an untrusted HTML fragment.
     *
     * @param unsafeHtml the untrusted HTML; {@code null} is treated as empty input
     * @return the sanitized body HTML (never {@code null}), with the newlines of the input preserved
     */
    public static String sanitize(String unsafeHtml) {
        if (unsafeHtml == null) {
            return "";
        }

        // Newlines are swapped for a random placeholder before parsing and restored afterwards,
        // so that they survive jsoup's whitespace handling on output.
        String newlineToken = "__NL" + random.nextInt() + "__";
        Document doc = Jsoup.parseBodyFragment(unsafeHtml.replace("\n", newlineToken));
        Element body = doc.body();

        // Iterate over a snapshot of the elements rather than replacing nodes from inside a
        // NodeVisitor: replacing the visited node during traversal is only supported by newer
        // jsoup releases, whereas a snapshot is safe regardless of the jsoup version in use.
        for (Element el : body.getAllElements()) {
            sanitizeElement(el, newlineToken);
        }

        // SVG / MathML subtrees are replaced wholesale.
        doc.select("svg, math, foreignObject").forEach(el -> replaceWithNotice(el, el.tagName()));

        return body.html().replace(newlineToken, "\n");
    }

    /** Applies every per-element rule to {@code el}; may replace {@code el} in its parent. */
    private static void sanitizeElement(Element el, String newlineToken) {
        String tag = el.tagName().toLowerCase(Locale.ROOT);

        // <link href="..."> that points to an external resource (e.g. a remote stylesheet).
        if (tag.equals("link") && isExternalUrl(el.attr("href"), newlineToken)) {
            replaceWithNotice(el, "link[href]");
            return;
        }

        // Elements that can execute code or embed foreign content.
        switch (tag) {
            case "script":
            case "iframe":
            case "object":
            case "embed":
            case "video":
            case "audio":
            case "source":
            case "meta":
            case "fencedframe":
                replaceWithNotice(el, tag);
                return;
            default:
                break;
        }

        // Inline event handlers (onclick, onerror, ...).
        Iterator<Attribute> attributes = el.attributes().iterator();
        while (attributes.hasNext()) {
            if (startsWithIgnoreCase(attributes.next().getKey(), "on")) {
                attributes.remove();
            }
        }

        // Script-scheme URLs would otherwise run code when the link/form/resource is activated.
        for (String name : URL_ATTRIBUTES) {
            if (el.hasAttr(name) && hasScriptScheme(el.attr(name), newlineToken)) {
                el.removeAttr(name);
            }
        }

        // External src references.
        if (el.hasAttr("src") && isExternalUrl(el.attr("src"), newlineToken)) {
            el.removeAttr("src");
            el.appendChild(new TextNode("[已移除外链资源]"));
        }

        // Inline style: drop url(...) references that are not data: URIs.
        if (el.hasAttr("style")) {
            String style = el.attr("style");
            String cleaned = EXTERNAL_CSS_URL.matcher(style).replaceAll("");
            if (!cleaned.equals(style)) {
                el.attr("style", cleaned + " /* 外链背景图已清除 */");
            }
        }

        // Anchors: make sure "noreferrer" is present while keeping any existing rel tokens.
        if (tag.equals("a")) {
            String rel = el.attr("rel").trim();
            if (!rel.toLowerCase(Locale.ROOT).contains("noreferrer")) {
                el.attr("rel", rel.isEmpty() ? "noreferrer" : rel + " noreferrer");
            }
        }

        // <style> element content. jsoup stores it as a DataNode (not a TextNode), so both
        // node kinds are handled here.
        if (tag.equals("style")) {
            for (Node child : el.childNodes()) {
                if (child instanceof DataNode) {
                    DataNode data = (DataNode) child;
                    data.setWholeData(removeExternalCssUrls(data.getWholeData()));
                } else if (child instanceof TextNode) {
                    TextNode text = (TextNode) child;
                    text.text(removeExternalCssUrls(text.getWholeText()));
                }
            }
        }
    }

    /** Replaces every non-data {@code url(...)} in {@code css} with a marker comment. */
    private static String removeExternalCssUrls(String css) {
        return EXTERNAL_CSS_URL.matcher(css).replaceAll("/* 外链已清除 */");
    }

    /**
     * Returns the URL roughly as a browser would see it for scheme detection: the newline
     * placeholder, ASCII whitespace and control characters are removed, and backslashes are
     * treated as forward slashes. Used for checks only; the attribute value is not rewritten.
     */
    private static String compactUrl(String value, String newlineToken) {
        String raw = value.replace(newlineToken, "");
        StringBuilder compact = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            if (c > ' ') {
                compact.append(c == '\\' ? '/' : c);
            }
        }
        return compact.toString();
    }

    /** Returns whether {@code url} is an absolute http(s) or protocol-relative URL. */
    private static boolean isExternalUrl(String url, String newlineToken) {
        if (url == null) {
            return false;
        }
        String compact = compactUrl(url, newlineToken);
        return startsWithIgnoreCase(compact, "http://")
                || startsWithIgnoreCase(compact, "https://")
                || compact.startsWith("//");
    }

    /** Returns whether {@code value} is a {@code javascript:} or {@code vbscript:} URL. */
    private static boolean hasScriptScheme(String value, String newlineToken) {
        String compact = compactUrl(value, newlineToken);
        return startsWithIgnoreCase(compact, "javascript:")
                || startsWithIgnoreCase(compact, "vbscript:");
    }

    /** Locale-independent, case-insensitive {@link String#startsWith(String)}. */
    private static boolean startsWithIgnoreCase(String value, String prefix) {
        return value.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Replaces {@code el} in its parent with a {@code div.blocked-element} notice naming {@code tagName}. */
    private static void replaceWithNotice(Element el, String tagName) {
        if (el.parent() == null) {
            // Nothing to replace it in.
            return;
        }
        el.replaceWith(new Element("div")
                .addClass("blocked-element")
                .text(tagName + " 元素可能包含危害安全的内容,已屏蔽。"));
    }
}