285 lines
12 KiB
Java
285 lines
12 KiB
Java
package quant.rich;
|
||
|
||
import java.io.IOException;
|
||
import java.io.Serializable;
|
||
import java.net.URI;
|
||
import java.nio.file.Files;
|
||
import java.nio.file.Path;
|
||
import java.util.ArrayList;
|
||
import java.util.Base64;
|
||
import java.util.List;
|
||
import java.util.Optional;
|
||
import java.util.regex.Matcher;
|
||
import java.util.regex.Pattern;
|
||
|
||
import org.graalvm.polyglot.Context;
|
||
import org.graalvm.polyglot.Value;
|
||
import org.junit.jupiter.api.Test;
|
||
import org.junit.runner.RunWith;
|
||
import org.springframework.beans.factory.annotation.Autowired;
|
||
import org.springframework.boot.test.context.SpringBootTest;
|
||
import org.springframework.test.context.ContextConfiguration;
|
||
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
|
||
|
||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||
import com.fasterxml.jackson.databind.JsonNode;
|
||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||
import com.fasterxml.jackson.databind.node.ArrayNode;
|
||
|
||
import jodd.jerry.Jerry;
|
||
import lombok.extern.slf4j.Slf4j;
|
||
import okhttp3.OkHttpClient;
|
||
import okhttp3.Request;
|
||
import okhttp3.Response;
|
||
import quant.rich.emoney.EmoneyAutoApplication;
|
||
import quant.rich.emoney.client.EmoneyClient;
|
||
import quant.rich.emoney.client.OkHttpClientProvider;
|
||
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail;
|
||
import quant.rich.emoney.pojo.dto.NonParamsIndexDetail.NonParamsIndexDetailData;
|
||
import quant.rich.emoney.service.sqlite.RequestInfoService;
|
||
|
||
@SpringBootTest
|
||
@ContextConfiguration(classes = EmoneyAutoApplication.class)
|
||
@RunWith(SpringJUnit4ClassRunner.class)
|
||
@Slf4j
|
||
public class EmoneyIndexScraper {
|
||
|
||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
||
|
||
@Autowired
|
||
RequestInfoService requestInfoService;
|
||
|
||
static {
|
||
MAPPER.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||
}
|
||
|
||
public String buildUrl(Serializable indexCode) {
|
||
StringBuilder urlBuilder = new StringBuilder();
|
||
urlBuilder.append("https://appstatic.emoney.cn/html/emapp/stock/note/?name=");
|
||
urlBuilder.append(indexCode.toString());
|
||
urlBuilder.append("&emoneyScaleType=0&emoneyLandMode=0&token=");
|
||
urlBuilder.append(requestInfoService.getDefaultRequestInfo().getAuthorization());
|
||
return urlBuilder.toString();
|
||
}
|
||
|
||
@Test
|
||
void test() {
|
||
|
||
EmoneyClient.relogin();
|
||
|
||
String url = buildUrl(10002800);
|
||
|
||
// 1.1 按照顺序放置 Header
|
||
Request request = new Request.Builder()
|
||
.url(url)
|
||
.header("Host", "appstatic.emoney.cn")
|
||
.header("Connection", "keep-alive")
|
||
.header("Upgrade-Insecure-Requests", "1")
|
||
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
|
||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
||
.header("X-Request-With", "cn.emoney.emstock")
|
||
.header("Sec-Fetch-Site", "none")
|
||
.header("Sec-Fetch-Mode", "navigate")
|
||
.header("Sec-Fetch-User", "?1")
|
||
.header("Sec-Fetch-Dest", "document")
|
||
.header("Accept-Encoding", "gzip, deflate")
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
|
||
.build();
|
||
|
||
// 1.2 请求
|
||
OkHttpClient client = OkHttpClientProvider.getInstance();
|
||
List<String> scripts = new ArrayList<>();
|
||
try (Response response = client.newCall(request).execute()) {
|
||
if (!response.isSuccessful()) {
|
||
log.error("1.2 请求指标说明基本页面失败:response.isSuccessful() is false");
|
||
return;
|
||
}
|
||
|
||
String responseBody = response.body().string();
|
||
Jerry jerryDoc = Jerry.of(responseBody);
|
||
jerryDoc.s("script[src]").forEach(el -> {
|
||
|
||
String absoluteURI = resolveUrl(url, el.attr("src"));
|
||
log.info("script uri: {}", absoluteURI);
|
||
if (absoluteURI != null) {
|
||
scripts.add(absoluteURI);
|
||
}
|
||
});
|
||
|
||
} catch (IOException e) {
|
||
// TODO Auto-generated catch block
|
||
log.error("1.2 请求指标说明基本页面失败,IOException", e);
|
||
e.printStackTrace();
|
||
return;
|
||
}
|
||
|
||
if (scripts.size() == 0) {
|
||
log.error("未能获取基本页面内脚本链接");
|
||
return;
|
||
}
|
||
|
||
// 2. 加载 scripts,试图匹配包含在 js 中的指标说明
|
||
Pattern p = Pattern.compile("\\[(\\{(id:\\d+,?|data:\\[\\{(title:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|items:\\[(\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?)+],?|image:\".*?\",?)+\\}+\\],?|name:\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*?\",?|nameCode:\"\\d+\",?)+\\},?)+\\]", Pattern.DOTALL);
|
||
List<String> matchGroups = new ArrayList<>();
|
||
Request.Builder scriptBuilder = new Request.Builder()
|
||
.header("Host", "appstatic.emoney.cn")
|
||
.header("Connection", "keep-alive")
|
||
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
|
||
.header("Accept", "*/*")
|
||
.header("X-Request-With", "cn.emoney.emstock")
|
||
.header("Sec-Fetch-Site", "same-origin")
|
||
.header("Sec-Fetch-Mode", "no-cors")
|
||
.header("Sec-Fetch-Dest", "script")
|
||
.header("Accept-Encoding", "gzip, deflate")
|
||
.header("Referer", url)
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
|
||
for (String scriptUrl : scripts) {
|
||
Request scriptRequest = scriptBuilder.url(scriptUrl).build();
|
||
try (Response response = client.newCall(scriptRequest).execute()) {
|
||
if (!response.isSuccessful()) {
|
||
log.warn("2. 读取页面内 script 请求失败, response.isSuccessful() is false, scriptUrl: {}", scriptUrl);
|
||
return;
|
||
}
|
||
|
||
String responseBody = response.body().string();
|
||
// 尝试匹配
|
||
Matcher m = p.matcher(responseBody);
|
||
while (m.find()) {
|
||
String find = m.group();
|
||
Optional<String> jsonStringify = stringifyJsArray(find);
|
||
jsonStringify.ifPresent(jsonString -> matchGroups.add(jsonString));
|
||
}
|
||
|
||
} catch (IOException e) {
|
||
// TODO Auto-generated catch block
|
||
log.warn("1.2 请求失败", e);
|
||
e.printStackTrace();
|
||
return;
|
||
}
|
||
}
|
||
|
||
// 将每个 jsonString 转换为 jsonArray,进一步转换成 IndexDetail
|
||
List<NonParamsIndexDetail> valid = new ArrayList<>();
|
||
for (String jsonString : matchGroups) {
|
||
try {
|
||
JsonNode root = MAPPER.readTree(jsonString);
|
||
if (root.isArray()) {
|
||
ArrayNode array = (ArrayNode) root;
|
||
for (JsonNode obj : array) {
|
||
if (obj.isObject() && matches(obj)) {
|
||
NonParamsIndexDetail detail = MAPPER.treeToValue(obj, NonParamsIndexDetail.class);
|
||
valid.add(detail);
|
||
}
|
||
}
|
||
}
|
||
} catch (Exception ignored) {
|
||
log.error("试图转换匹配项出错", ignored);
|
||
}
|
||
}
|
||
|
||
// 直接保存在本地
|
||
// 遍历 valid 内合法的 object, 获取 image
|
||
Pattern numericPattern = Pattern.compile("^\\d+$");
|
||
for (NonParamsIndexDetail detail : valid) {
|
||
// 判断 nameCode 是否是合法的代码
|
||
if (!numericPattern.matcher(detail.getNameCode()).matches()) {
|
||
continue;
|
||
}
|
||
Request.Builder imageBuilder = new Request.Builder()
|
||
.header("Host", "appstatic.emoney.cn")
|
||
.header("Connection", "keep-alive")
|
||
.header("User-Agent", requestInfoService.getDefaultRequestInfo().getWebviewUserAgent())
|
||
.header("Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8")
|
||
.header("X-Request-With", "cn.emoney.emstock")
|
||
.header("Sec-Fetch-Site", "same-origin")
|
||
.header("Sec-Fetch-Mode", "no-cors")
|
||
.header("Sec-Fetch-Dest", "image")
|
||
.header("Accept-Encoding", "gzip, deflate")
|
||
.header("Referer", buildUrl(detail.getNameCode()))
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
|
||
|
||
List<NonParamsIndexDetailData> datas = detail.getData();
|
||
for (NonParamsIndexDetailData data : datas) {
|
||
if (data.getImage() != null) {
|
||
String image = data.getImage();
|
||
if (image.startsWith("data:image/") && image.contains(";base64,")) {
|
||
continue;
|
||
}
|
||
else {
|
||
image = resolveUrl(url, image);
|
||
Request imageRequest = imageBuilder.url(image).build();
|
||
try (Response response = client.newCall(imageRequest).execute()) {
|
||
if (!response.isSuccessful()) {
|
||
log.warn("获取详情图片请求失败, response.isSuccessful() is false, imageUrl: {}", image);
|
||
continue;
|
||
}
|
||
|
||
byte[] bytes = response.body().bytes();
|
||
String contentType = response.body().contentType().toString();
|
||
String base64 = Base64.getEncoder().encodeToString(bytes);
|
||
data.setImage("data:" + contentType + ";base64," + base64);
|
||
log.debug("获取图片成功");
|
||
|
||
} catch (IOException e) {
|
||
// TODO Auto-generated catch block
|
||
log.warn("获取详情图片请求失败", e);
|
||
e.printStackTrace();
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
try {
|
||
Files.writeString(
|
||
Path.of("./resources/conf/extra/nonParamsIndexDetail." + detail.getNameCode() + ".json"),
|
||
MAPPER.valueToTree(detail).toPrettyString());
|
||
} catch (IllegalArgumentException | IOException e) {
|
||
// TODO Auto-generated catch block
|
||
e.printStackTrace();
|
||
}
|
||
|
||
}
|
||
return;
|
||
}
|
||
|
||
/**
|
||
* 判断某 jsonNode 是否为合法指标信息
|
||
* @param obj
|
||
* @return
|
||
*/
|
||
public static boolean matches(JsonNode obj) {
|
||
return obj.has("name") && obj.get("name").isTextual() &&
|
||
obj.has("nameCode") && obj.get("nameCode").isTextual() &&
|
||
obj.has("data") && obj.get("data").isArray()
|
||
;
|
||
}
|
||
|
||
|
||
/**
|
||
* 使用 GraalVM JS 尝试 JSON.stringify
|
||
* @param jsArrayText
|
||
* @return
|
||
*/
|
||
public static Optional<String> stringifyJsArray(String jsArrayText) {
|
||
try (Context context = Context.create("js")) {
|
||
Value result = context.eval("js", "JSON.stringify(" + jsArrayText + ")");
|
||
return result.isString() ? Optional.of(result.asString()) : Optional.empty();
|
||
} catch (Exception e) {
|
||
log.warn("Failed to call JSON.stringify on {}", jsArrayText, e);
|
||
return Optional.empty();
|
||
}
|
||
}
|
||
|
||
public static String resolveUrl(String pageUrl, String scriptSrc) {
|
||
try {
|
||
URI base = URI.create(pageUrl);
|
||
return base.resolve(scriptSrc).toString();
|
||
} catch (Exception e) {
|
||
log.info("转换 URI 错误,pageUrl: {}, scriptSrc: {} {}", pageUrl, scriptSrc, e);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
}
|