获取指标说明的能力

This commit is contained in:
2025-05-21 15:51:44 +08:00
parent 06c351a956
commit 1f329e3b2a
131 changed files with 4461 additions and 3126 deletions

View File

@@ -0,0 +1,285 @@
package quant.rich;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.graalvm.polyglot.Context;
import org.graalvm.polyglot.Value;
import org.junit.jupiter.api.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import jodd.jerry.Jerry;
import lombok.extern.slf4j.Slf4j;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import quant.rich.emoney.EmoneyAutoApplication;
import quant.rich.emoney.client.EmoneyClient;
import quant.rich.emoney.client.OkHttpClientProvider;
import quant.rich.emoney.entity.config.EmoneyRequestConfig;
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail;
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail.NonParamsIndexDetailData;
/**
 * Scraper, packaged as a Spring Boot / JUnit 5 test, that downloads the indicator
 * description ("note") page from {@code appstatic.emoney.cn}, extracts the indicator
 * descriptions embedded as JavaScript array literals in the page's scripts, inlines
 * every referenced image as a {@code data:} URI and writes one JSON file per indicator
 * to {@code ./conf/extra}.
 *
 * <p>The former {@code @RunWith(SpringJUnit4ClassRunner.class)} was dropped on purpose:
 * the only test method uses the JUnit Jupiter {@code @Test} annotation, for which a
 * JUnit 4 runner has no effect (and makes a vintage engine report a class without
 * runnable methods). {@code @SpringBootTest} already wires Spring into Jupiter.
 */
@SpringBootTest
@ContextConfiguration(classes = EmoneyAutoApplication.class)
@Slf4j
public class EmoneyIndexScraper {

    /** Shared mapper; unknown JSON properties are ignored when mapping to DTOs. */
    private static final ObjectMapper MAPPER =
            new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    private static final String HOST = "appstatic.emoney.cn";
    private static final String NOTE_PAGE_URL = "https://appstatic.emoney.cn/html/emapp/stock/note/?name=";
    private static final String APP_PACKAGE = "cn.emoney.emstock";
    private static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7";

    /**
     * NOTE(review): OkHttp only decompresses gzip transparently when the caller does NOT
     * set Accept-Encoding itself. Unless the client returned by OkHttpClientProvider
     * decodes the body in an interceptor, a gzip-encoded response would be read as raw
     * compressed bytes - confirm against OkHttpClientProvider.
     */
    private static final String ACCEPT_ENCODING = "gzip, deflate";

    /** Header sent with the app's package name; the original spelling "X-Request-With" was a typo. */
    private static final String REQUESTED_WITH_HEADER = "X-Requested-With";

    /** Content type used for the data URI when the image response does not declare one. */
    private static final String FALLBACK_CONTENT_TYPE = "application/octet-stream";

    /** Directory the per-indicator JSON files are written to. */
    private static final String OUTPUT_DIR = "./conf/extra";

    /** Indicator code whose note page is used as the entry point of the scrape. */
    private static final int ENTRY_INDEX_CODE = 10002800;

    /** Matches the JavaScript array literal that carries the indicator descriptions. */
    private static final Pattern INDEX_ARRAY_PATTERN = Pattern.compile("\\[(\\{(id:\\d+,?|data:\\[\\{(title:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|items:\\[(\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?)+],?|image:\".*?\",?)+\\}+\\],?|name:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|nameCode:\"\\d+\",?)+\\},?)+\\]", Pattern.DOTALL);

    /** Accepts name codes made of digits only; this also keeps the output file name safe. */
    private static final Pattern NUMERIC_PATTERN = Pattern.compile("^\\d+$");

    @Autowired
    EmoneyRequestConfig emoneyRequestConfig;

    /**
     * Builds the URL of the indicator note page for the given indicator code.
     *
     * @param indexCode indicator code; its {@code toString()} is placed in the {@code name} parameter
     * @return the page URL including the configured authorization token
     */
    public String buildUrl(Serializable indexCode) {
        return NOTE_PAGE_URL
                + indexCode.toString()
                + "&emoneyScaleType=0&emoneyLandMode=0&token="
                + emoneyRequestConfig.getAuthorization();
    }

    /**
     * Runs the whole scrape: page, then scripts, then indicator details, then images, then files.
     * A failure to load the entry page aborts the run; a failure on a single script, image
     * or file is logged and skipped so the remaining items are still processed.
     */
    @Test
    void test() {
        EmoneyClient.relogin();
        String pageUrl = buildUrl(ENTRY_INDEX_CODE);
        OkHttpClient client = OkHttpClientProvider.getInstance();

        // 1. Load the note page and collect the URLs of its external scripts.
        List<String> scriptUrls;
        try (Response response = client.newCall(pageRequest(pageUrl)).execute()) {
            if (!response.isSuccessful()) {
                log.error("1.2 请求指标说明基本页面失败response.isSuccessful() is false");
                return;
            }
            scriptUrls = extractScriptUrls(pageUrl, response.body().string());
        } catch (IOException e) {
            log.error("1.2 请求指标说明基本页面失败IOException", e);
            return;
        }
        if (scriptUrls.isEmpty()) {
            log.error("未能获取基本页面内脚本链接");
            return;
        }

        // 2. Load the scripts and pull out the indicator descriptions embedded in them.
        List<String> jsonArrays = extractIndexJson(client, pageUrl, scriptUrls);

        // 3. Convert, inline images and persist each valid indicator.
        for (NonParamsIndexDetail detail : parseDetails(jsonArrays)) {
            String nameCode = detail.getNameCode();
            if (nameCode == null || !NUMERIC_PATTERN.matcher(nameCode).matches()) {
                continue;
            }
            inlineImages(client, pageUrl, detail);
            saveDetail(detail);
        }
    }

    /** Builds the request for the note page; headers are added in a fixed order. */
    private Request pageRequest(String pageUrl) {
        return new Request.Builder()
                .url(pageUrl)
                .header("Host", HOST)
                .header("Connection", "keep-alive")
                .header("Upgrade-Insecure-Requests", "1")
                .header("User-Agent", emoneyRequestConfig.getWebviewUserAgent())
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .header(REQUESTED_WITH_HEADER, APP_PACKAGE)
                .header("Sec-Fetch-Site", "none")
                .header("Sec-Fetch-Mode", "navigate")
                .header("Sec-Fetch-User", "?1")
                .header("Sec-Fetch-Dest", "document")
                .header("Accept-Encoding", ACCEPT_ENCODING)
                .header("Accept-Language", ACCEPT_LANGUAGE)
                .build();
    }

    /** Builds the reusable request builder for script downloads; only the URL varies per request. */
    private Request.Builder scriptRequestBuilder(String referer) {
        return new Request.Builder()
                .header("Host", HOST)
                .header("Connection", "keep-alive")
                .header("User-Agent", emoneyRequestConfig.getWebviewUserAgent())
                .header("Accept", "*/*")
                .header(REQUESTED_WITH_HEADER, APP_PACKAGE)
                .header("Sec-Fetch-Site", "same-origin")
                .header("Sec-Fetch-Mode", "no-cors")
                .header("Sec-Fetch-Dest", "script")
                .header("Accept-Encoding", ACCEPT_ENCODING)
                .header("Referer", referer)
                .header("Accept-Language", ACCEPT_LANGUAGE);
    }

    /** Builds the reusable request builder for image downloads; only the URL varies per request. */
    private Request.Builder imageRequestBuilder(String referer) {
        return new Request.Builder()
                .header("Host", HOST)
                .header("Connection", "keep-alive")
                .header("User-Agent", emoneyRequestConfig.getWebviewUserAgent())
                .header("Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8")
                .header(REQUESTED_WITH_HEADER, APP_PACKAGE)
                .header("Sec-Fetch-Site", "same-origin")
                .header("Sec-Fetch-Mode", "no-cors")
                .header("Sec-Fetch-Dest", "image")
                .header("Accept-Encoding", ACCEPT_ENCODING)
                .header("Referer", referer)
                .header("Accept-Language", ACCEPT_LANGUAGE);
    }

    /** Returns the absolute URLs of all {@code <script src>} elements in the given HTML. */
    private static List<String> extractScriptUrls(String pageUrl, String html) {
        List<String> scriptUrls = new ArrayList<>();
        Jerry.of(html).s("script[src]").forEach(el -> {
            String absoluteUri = resolveUrl(pageUrl, el.attr("src"));
            log.info("script uri: {}", absoluteUri);
            if (absoluteUri != null) {
                scriptUrls.add(absoluteUri);
            }
        });
        return scriptUrls;
    }

    /** Downloads each script and returns every matched indicator array, converted to JSON text. */
    private List<String> extractIndexJson(OkHttpClient client, String pageUrl, List<String> scriptUrls) {
        List<String> jsonArrays = new ArrayList<>();
        Request.Builder scriptBuilder = scriptRequestBuilder(pageUrl);
        for (String scriptUrl : scriptUrls) {
            // The request is built inside the resource clause so an invalid URL is caught below.
            try (Response response = client.newCall(scriptBuilder.url(scriptUrl).build()).execute()) {
                if (!response.isSuccessful()) {
                    log.warn("2. 读取页面内 script 请求失败, response.isSuccessful() is false, scriptUrl: {}", scriptUrl);
                    continue; // one unavailable script must not discard the others
                }
                Matcher matcher = INDEX_ARRAY_PATTERN.matcher(response.body().string());
                while (matcher.find()) {
                    stringifyJsArray(matcher.group()).ifPresent(jsonArrays::add);
                }
            } catch (IOException | IllegalArgumentException e) {
                log.warn("2. 读取页面内 script 请求失败, scriptUrl: {}", scriptUrl, e);
            }
        }
        return jsonArrays;
    }

    /** Parses each JSON array and maps every element accepted by {@link #matches(JsonNode)} to a DTO. */
    private static List<NonParamsIndexDetail> parseDetails(List<String> jsonArrays) {
        List<NonParamsIndexDetail> details = new ArrayList<>();
        for (String jsonString : jsonArrays) {
            try {
                JsonNode root = MAPPER.readTree(jsonString);
                if (!root.isArray()) {
                    continue;
                }
                for (JsonNode node : (ArrayNode) root) {
                    if (node.isObject() && matches(node)) {
                        details.add(MAPPER.treeToValue(node, NonParamsIndexDetail.class));
                    }
                }
            } catch (IOException | RuntimeException e) {
                log.error("试图转换匹配项出错", e);
            }
        }
        return details;
    }

    /**
     * Replaces every image URL of the given detail with a base64 {@code data:} URI.
     * Images that are already data URIs are left alone; images that cannot be resolved
     * or downloaded keep their original value.
     */
    private void inlineImages(OkHttpClient client, String pageUrl, NonParamsIndexDetail detail) {
        List<NonParamsIndexDetailData> sections = detail.getData();
        if (sections == null) {
            return;
        }
        Request.Builder imageBuilder = imageRequestBuilder(buildUrl(detail.getNameCode()));
        for (NonParamsIndexDetailData section : sections) {
            String image = section == null ? null : section.getImage();
            if (image == null || (image.startsWith("data:image/") && image.contains(";base64,"))) {
                continue;
            }
            String imageUrl = resolveUrl(pageUrl, image);
            if (imageUrl == null) {
                log.warn("Cannot resolve detail image url, image: {}", image);
                continue;
            }
            try (Response response = client.newCall(imageBuilder.url(imageUrl).build()).execute()) {
                if (!response.isSuccessful()) {
                    log.warn("获取详情图片请求失败, response.isSuccessful() is false, imageUrl: {}", imageUrl);
                    continue;
                }
                byte[] bytes = response.body().bytes();
                // contentType() is null when the server sends no parseable Content-Type header.
                String contentType = Optional.ofNullable(response.body().contentType())
                        .map(Object::toString)
                        .orElse(FALLBACK_CONTENT_TYPE);
                String base64 = Base64.getEncoder().encodeToString(bytes);
                section.setImage("data:" + contentType + ";base64," + base64);
                log.debug("获取图片成功");
            } catch (IOException | IllegalArgumentException e) {
                log.warn("获取详情图片请求失败", e);
            }
        }
    }

    /** Writes the detail as pretty-printed JSON, creating the output directory when missing. */
    private static void saveDetail(NonParamsIndexDetail detail) {
        // The caller has verified that the name code consists of digits only.
        Path target = Path.of(OUTPUT_DIR, "nonParamsIndexDetail." + detail.getNameCode() + ".json");
        try {
            Files.createDirectories(target.getParent());
            Files.writeString(target, MAPPER.valueToTree(detail).toPrettyString());
        } catch (IllegalArgumentException | IOException e) {
            log.error("Failed to write index detail to {}", target, e);
        }
    }

    /**
     * Tells whether a JSON node has the shape of an indicator description.
     *
     * @param obj node to inspect
     * @return {@code true} when {@code name} and {@code nameCode} are textual and {@code data} is an array
     */
    public static boolean matches(JsonNode obj) {
        return obj.has("name") && obj.get("name").isTextual()
                && obj.has("nameCode") && obj.get("nameCode").isTextual()
                && obj.has("data") && obj.get("data").isArray();
    }

    /**
     * Converts a JavaScript array literal to JSON text by evaluating
     * {@code JSON.stringify(...)} in a GraalVM JS context.
     *
     * <p>NOTE(review): {@code jsArrayText} is evaluated as JavaScript. The caller passes
     * text cut out of remotely hosted scripts, so this runs remote-controlled code inside
     * the polyglot context. The context is created without extra access being granted
     * here; confirm that this is an acceptable sandbox for the deployment.
     *
     * @param jsArrayText JavaScript source of the array literal
     * @return the JSON text, or empty when evaluation fails or does not yield a string
     */
    public static Optional<String> stringifyJsArray(String jsArrayText) {
        try (Context context = Context.create("js")) {
            Value result = context.eval("js", "JSON.stringify(" + jsArrayText + ")");
            return result.isString() ? Optional.of(result.asString()) : Optional.empty();
        } catch (Exception e) {
            log.warn("Failed to call JSON.stringify on {}", jsArrayText, e);
            return Optional.empty();
        }
    }

    /**
     * Resolves a possibly relative reference against the URL of the page it appears on.
     *
     * @param pageUrl   URL of the containing page
     * @param scriptSrc reference found in the page (script or image source)
     * @return the absolute URL, or {@code null} when either argument is {@code null} or not a valid URI
     */
    public static String resolveUrl(String pageUrl, String scriptSrc) {
        try {
            return URI.create(pageUrl).resolve(scriptSrc).toString();
        } catch (IllegalArgumentException | NullPointerException e) {
            log.info("转换 URI 错误pageUrl: {}, scriptSrc: {} {}", pageUrl, scriptSrc, e);
            return null;
        }
    }
}