Files
emo-grab/src/test/java/quant/rich/EmoneyIndexScraper.java

285 lines
12 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package quant.rich;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.graalvm.polyglot.Context;
import org.graalvm.polyglot.Value;
import org.junit.jupiter.api.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import jodd.jerry.Jerry;
import lombok.extern.slf4j.Slf4j;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import quant.rich.emoney.EmoneyAutoApplication;
import quant.rich.emoney.client.EmoneyClient;
import quant.rich.emoney.client.OkHttpClientProvider;
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail;
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail.NonParamsIndexDetailData;
import quant.rich.emoney.service.sqlite.RequestInfoService;
@SpringBootTest
@ContextConfiguration(classes = EmoneyAutoApplication.class)
@RunWith(SpringJUnit4ClassRunner.class)
@Slf4j
public class EmoneyIndexScraper {
private static final ObjectMapper MAPPER = new ObjectMapper();
@Autowired
RequestInfoService requestInfoService;
static {
MAPPER.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
}
public String buildUrl(Serializable indexCode) {
StringBuilder urlBuilder = new StringBuilder();
urlBuilder.append("https://appstatic.emoney.cn/html/emapp/stock/note/?name=");
urlBuilder.append(indexCode.toString());
urlBuilder.append("&emoneyScaleType=0&emoneyLandMode=0&token=");
urlBuilder.append(requestInfoService.getDefaultRequestInfo().getAuthorization());
return urlBuilder.toString();
}
@Test
void test() {
EmoneyClient.relogin();
String url = buildUrl(10002800);
// 1.1 按照顺序放置 Header
Request request = new Request.Builder()
.url(url)
.header("Host", "appstatic.emoney.cn")
.header("Connection", "keep-alive")
.header("Upgrade-Insecure-Requests", "1")
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
.header("X-Request-With", "cn.emoney.emstock")
.header("Sec-Fetch-Site", "none")
.header("Sec-Fetch-Mode", "navigate")
.header("Sec-Fetch-User", "?1")
.header("Sec-Fetch-Dest", "document")
.header("Accept-Encoding", "gzip, deflate")
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
.build();
// 1.2 请求
OkHttpClient client = OkHttpClientProvider.getInstance();
List<String> scripts = new ArrayList<>();
try (Response response = client.newCall(request).execute()) {
if (!response.isSuccessful()) {
log.error("1.2 请求指标说明基本页面失败response.isSuccessful() is false");
return;
}
String responseBody = response.body().string();
Jerry jerryDoc = Jerry.of(responseBody);
jerryDoc.s("script[src]").forEach(el -> {
String absoluteURI = resolveUrl(url, el.attr("src"));
log.info("script uri: {}", absoluteURI);
if (absoluteURI != null) {
scripts.add(absoluteURI);
}
});
} catch (IOException e) {
// TODO Auto-generated catch block
log.error("1.2 请求指标说明基本页面失败IOException", e);
e.printStackTrace();
return;
}
if (scripts.size() == 0) {
log.error("未能获取基本页面内脚本链接");
return;
}
// 2. 加载 scripts试图匹配包含在 js 中的指标说明
Pattern p = Pattern.compile("\\[(\\{(id:\\d+,?|data:\\[\\{(title:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|items:\\[(\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?)+],?|image:\".*?\",?)+\\}+\\],?|name:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|nameCode:\"\\d+\",?)+\\},?)+\\]", Pattern.DOTALL);
List<String> matchGroups = new ArrayList<>();
Request.Builder scriptBuilder = new Request.Builder()
.header("Host", "appstatic.emoney.cn")
.header("Connection", "keep-alive")
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
.header("Accept", "*/*")
.header("X-Request-With", "cn.emoney.emstock")
.header("Sec-Fetch-Site", "same-origin")
.header("Sec-Fetch-Mode", "no-cors")
.header("Sec-Fetch-Dest", "script")
.header("Accept-Encoding", "gzip, deflate")
.header("Referer", url)
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
for (String scriptUrl : scripts) {
Request scriptRequest = scriptBuilder.url(scriptUrl).build();
try (Response response = client.newCall(scriptRequest).execute()) {
if (!response.isSuccessful()) {
log.warn("2. 读取页面内 script 请求失败, response.isSuccessful() is false, scriptUrl: {}", scriptUrl);
return;
}
String responseBody = response.body().string();
// 尝试匹配
Matcher m = p.matcher(responseBody);
while (m.find()) {
String find = m.group();
Optional<String> jsonStringify = stringifyJsArray(find);
jsonStringify.ifPresent(jsonString -> matchGroups.add(jsonString));
}
} catch (IOException e) {
// TODO Auto-generated catch block
log.warn("1.2 请求失败", e);
e.printStackTrace();
return;
}
}
// 将每个 jsonString 转换为 jsonArray进一步转换成 IndexDetail
List<NonParamsIndexDetail> valid = new ArrayList<>();
for (String jsonString : matchGroups) {
try {
JsonNode root = MAPPER.readTree(jsonString);
if (root.isArray()) {
ArrayNode array = (ArrayNode) root;
for (JsonNode obj : array) {
if (obj.isObject() && matches(obj)) {
NonParamsIndexDetail detail = MAPPER.treeToValue(obj, NonParamsIndexDetail.class);
valid.add(detail);
}
}
}
} catch (Exception ignored) {
log.error("试图转换匹配项出错", ignored);
}
}
// 直接保存在本地
// 遍历 valid 内合法的 object, 获取 image
Pattern numericPattern = Pattern.compile("^\\d+$");
for (NonParamsIndexDetail detail : valid) {
// 判断 nameCode 是否是合法的代码
if (!numericPattern.matcher(detail.getNameCode()).matches()) {
continue;
}
Request.Builder imageBuilder = new Request.Builder()
.header("Host", "appstatic.emoney.cn")
.header("Connection", "keep-alive")
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
.header("Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8")
.header("X-Request-With", "cn.emoney.emstock")
.header("Sec-Fetch-Site", "same-origin")
.header("Sec-Fetch-Mode", "no-cors")
.header("Sec-Fetch-Dest", "image")
.header("Accept-Encoding", "gzip, deflate")
.header("Referer", buildUrl(detail.getNameCode()))
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
List<NonParamsIndexDetailData> datas = detail.getData();
for (NonParamsIndexDetailData data : datas) {
if (data.getImage() != null) {
String image = data.getImage();
if (image.startsWith("data:image/") && image.contains(";base64,")) {
continue;
}
else {
image = resolveUrl(url, image);
Request imageRequest = imageBuilder.url(image).build();
try (Response response = client.newCall(imageRequest).execute()) {
if (!response.isSuccessful()) {
log.warn("获取详情图片请求失败, response.isSuccessful() is false, imageUrl: {}", image);
continue;
}
byte[] bytes = response.body().bytes();
String contentType = response.body().contentType().toString();
String base64 = Base64.getEncoder().encodeToString(bytes);
data.setImage("data:" + contentType + ";base64," + base64);
log.debug("获取图片成功");
} catch (IOException e) {
// TODO Auto-generated catch block
log.warn("获取详情图片请求失败", e);
e.printStackTrace();
return;
}
}
}
}
try {
Files.writeString(
Path.of("./resources/conf/extra/nonParamsIndexDetail." + detail.getNameCode() + ".json"),
MAPPER.valueToTree(detail).toPrettyString());
} catch (IllegalArgumentException | IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
return;
}
/**
* 判断某 jsonNode 是否为合法指标信息
* @param obj
* @return
*/
public static boolean matches(JsonNode obj) {
return obj.has("name") && obj.get("name").isTextual() &&
obj.has("nameCode") && obj.get("nameCode").isTextual() &&
obj.has("data") && obj.get("data").isArray()
;
}
/**
* 使用 GraalVM JS 尝试 JSON.stringify
* @param jsArrayText
* @return
*/
public static Optional<String> stringifyJsArray(String jsArrayText) {
try (Context context = Context.create("js")) {
Value result = context.eval("js", "JSON.stringify(" + jsArrayText + ")");
return result.isString() ? Optional.of(result.asString()) : Optional.empty();
} catch (Exception e) {
log.warn("Failed to call JSON.stringify on {}", jsArrayText, e);
return Optional.empty();
}
}
public static String resolveUrl(String pageUrl, String scriptSrc) {
try {
URI base = URI.create(pageUrl);
return base.resolve(scriptSrc).toString();
} catch (Exception e) {
log.info("转换 URI 错误pageUrl: {}, scriptSrc: {} {}", pageUrl, scriptSrc, e);
return null;
}
}
}