序
本文主要研究一下Spring AI Alibaba的YuQueDocumentReader
YuQueDocumentReader
community/document-readers/spring-ai-alibaba-starter-document-reader-yuque/src/main/java/com/alibaba/cloud/ai/reader/yuque/YuQueDocumentReader.java
/**
 * {@link DocumentReader} that turns the content of a {@link YuQueResource} into
 * {@link Document}s by delegating to a {@link DocumentParser}.
 *
 * <p>
 * Every returned document has its metadata entry {@link YuQueResource#SOURCE} set to the
 * resource path reported by the resource.
 */
public class YuQueDocumentReader implements DocumentReader {

    private final DocumentParser parser;

    private final YuQueResource yuQueResource;

    /**
     * @param yuQueResource resource whose input stream is parsed
     * @param parser parser applied to the resource's input stream
     */
    public YuQueDocumentReader(YuQueResource yuQueResource, DocumentParser parser) {
        this.yuQueResource = yuQueResource;
        this.parser = parser;
    }

    /**
     * Parses the resource and tags each resulting document with its source path.
     * @return the documents produced by the parser
     * @throws RuntimeException wrapping the {@link IOException} if the resource cannot be
     * read
     */
    @Override
    public List<Document> get() {
        try {
            List<Document> documents = parser.parse(yuQueResource.getInputStream());
            String source = yuQueResource.getResourcePath();
            for (Document doc : documents) {
                doc.getMetadata().put(YuQueResource.SOURCE, source);
            }
            return documents;
        }
        catch (IOException ioException) {
            // RuntimeException does not interpolate "{}" placeholders, so the detail is
            // appended explicitly; the original exception is kept as the cause.
            throw new RuntimeException("Failed to load document from yuque: " + ioException.getMessage(),
                    ioException);
        }
    }

}
YuQueDocumentReader的构造器要求输入YuQueResource和DocumentParser;其get方法通过DocumentParser解析YuQueResource的输入流得到Document列表,最后给每个Document的metadata追加一个SOURCE(值为YuQueResource的resourcePath)
YuQueResource
community/document-readers/spring-ai-alibaba-starter-document-reader-yuque/src/main/java/com/alibaba/cloud/ai/reader/yuque/YuQueResource.java
/**
 * A {@link Resource} whose content is the HTML body of a single YuQue document.
 *
 * <p>
 * The constructor validates the resource path, checks the token against the YuQue API,
 * fetches the document detail and keeps the value of {@code data.body_html} in memory as
 * the resource's input stream.
 */
public class YuQueResource implements Resource {

    private static final String BASE_URL = "https://www.yuque.com";

    private static final String INFO_PATH = "/api/v2/hello";

    private static final String DOC_DETAIL_PATH = "/api/v2/repos/%s/%s/docs/%s";

    public static final String SOURCE = "source";

    public static final String SUPPORT_TYPE = "Doc";

    // Accepted path shape: https://host/groupLogin/bookSlug/id
    private static final Pattern RESOURCE_PATH_PATTERN = Pattern
        .compile("^https://[a-zA-Z0-9.-]+/([a-zA-Z0-9.-]+)/([a-zA-Z0-9.-]+)/([a-zA-Z0-9.-]+)$");

    // Compiled/created once and shared instead of being rebuilt per instance.
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final HttpClient httpClient;

    private final InputStream inputStream;

    private final URI uri;

    private final String resourcePath;

    private String groupLogin;

    private String bookSlug;

    private String id;

    /**
     * Creates the resource by validating the inputs and downloading the document.
     * @param yuQueToken user/team token sent as the {@code X-Auth-Token} header
     * @param resourcePath document URL of the form {@code https://host/group/book/id}
     * @throws IllegalArgumentException if the path or the token is rejected
     * @throws RuntimeException if a request fails, the response cannot be parsed, or the
     * document is not of type {@link #SUPPORT_TYPE}
     */
    public YuQueResource(String yuQueToken, String resourcePath) {
        this.resourcePath = resourcePath;
        this.httpClient = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();

        judgePathRule(resourcePath);
        judgeToken(yuQueToken);

        URI docUri = URI.create(BASE_URL + DOC_DETAIL_PATH.formatted(groupLogin, bookSlug, id));
        HttpResponse<String> response = executeGet(docUri, yuQueToken);
        if (response.statusCode() != 200) {
            throw new RuntimeException("Failed to fetch YuQue document, HTTP status " + response.statusCode());
        }

        // Parse the JSON response using Jackson
        JsonNode root;
        try {
            root = OBJECT_MAPPER.readTree(response.body());
        }
        catch (java.io.IOException e) {
            throw new RuntimeException(e);
        }

        JsonNode dataObject = (root == null) ? null : root.get("data");
        if (dataObject == null || !dataObject.isObject()) {
            throw new RuntimeException("Invalid response format: 'data' is not an object");
        }

        JsonNode typeNode = dataObject.get("type");
        if (typeNode == null || !Objects.equals(typeNode.asText(), SUPPORT_TYPE)) {
            throw new RuntimeException("Unsupported resource type, only support " + SUPPORT_TYPE);
        }

        JsonNode bodyHtmlNode = dataObject.get("body_html");
        if (bodyHtmlNode == null || bodyHtmlNode.isNull()) {
            throw new RuntimeException("Invalid response format: 'body_html' is missing");
        }

        // Encode explicitly as UTF-8 so the bytes do not depend on the platform charset.
        this.inputStream = new ByteArrayInputStream(
                bodyHtmlNode.asText().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        this.uri = URI.create(resourcePath);
    }

    /**
     * Judge resource path rule Official online doc
     * https://www.yuque.com/yuque/developer/openapi
     * @param resourcePath document URL; must match {@code https://host/aa/bb/cc}, from
     * which the group login, book slug and document id are extracted
     */
    private void judgePathRule(String resourcePath) {
        Assert.hasText(resourcePath, "Invalid resource path");
        Matcher matcher = RESOURCE_PATH_PATTERN.matcher(resourcePath);
        Assert.isTrue(matcher.matches(), "Invalid resource path");
        // Each group uses a '+' quantifier, so a successful match guarantees non-empty
        // values and no further emptiness checks are required.
        this.groupLogin = matcher.group(1);
        this.bookSlug = matcher.group(2);
        this.id = matcher.group(3);
    }

    /**
     * judge yuQue token
     * @param yuQueToken User/Team token
     */
    private void judgeToken(String yuQueToken) {
        Assert.hasText(yuQueToken, "YuQueToken must not be empty");
        HttpResponse<String> response = executeGet(URI.create(BASE_URL + INFO_PATH), yuQueToken);
        // Asserted outside any try/catch so the IllegalArgumentException is not re-wrapped.
        Assert.isTrue(response.statusCode() == 200, "Failed to auth YuQueToken");
    }

    /**
     * Sends an authenticated GET request and returns the response with a string body.
     * @param target request URI
     * @param yuQueToken token sent as the {@code X-Auth-Token} header
     * @return the HTTP response
     * @throws RuntimeException wrapping any failure raised while sending
     */
    private HttpResponse<String> executeGet(URI target, String yuQueToken) {
        HttpRequest httpRequest = HttpRequest.newBuilder()
            .header("X-Auth-Token", yuQueToken)
            .uri(target)
            .GET()
            .build();
        try {
            return this.httpClient.send(httpRequest, HttpResponse.BodyHandlers.ofString());
        }
        catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    //......

}
YuQueResource的构造器要求输入yuQueToken和resourcePath:它先通过judgePathRule校验resourcePath并从中提取groupLogin、bookSlug、id,再通过judgeToken请求/api/v2/hello校验token,然后通过httpClient请求https://www.yuque.com/api/v2/repos/{groupLogin}/{bookSlug}/docs/{id},最后把响应中data.body_html的内容包装为inputStream
示例
community/document-readers/spring-ai-alibaba-starter-document-reader-yuque/src/test/java/com/alibaba/cloud/ai/reader/yuque/YuQueDocumentLoaderIT.java
/**
 * Integration test for {@link YuQueDocumentReader}. It only runs when both the
 * {@code YUQUE_TOKEN} and {@code YUQUE_RESOURCE_PATH} environment variables are set.
 */
@EnabledIfEnvironmentVariable(named = "YUQUE_TOKEN", matches = ".+")
@EnabledIfEnvironmentVariable(named = "YUQUE_RESOURCE_PATH", matches = ".+")
class YuQueDocumentLoaderIT {

    private static final String YU_QUE_TOKEN = System.getenv("YUQUE_TOKEN");

    private static final String RESOURCE_PATH = System.getenv("YUQUE_RESOURCE_PATH");

    YuQueDocumentReader reader;

    YuQueResource source;

    static {
        // Print a hint once at class load time when the environment is incomplete.
        boolean environmentComplete = YU_QUE_TOKEN != null && RESOURCE_PATH != null;
        if (!environmentComplete) {
            System.out
                .println("YUQUE_TOKEN or YUQUE_RESOURCE_PATH environment variable is not set. Tests will be skipped.");
        }
    }

    /** Builds the resource and the reader, skipping the test when a variable is missing. */
    @BeforeEach
    public void beforeEach() {
        boolean tokenPresent = YU_QUE_TOKEN != null && !YU_QUE_TOKEN.isEmpty();
        Assumptions.assumeTrue(tokenPresent, "Skipping test because YUQUE_TOKEN is not set");

        boolean pathPresent = RESOURCE_PATH != null && !RESOURCE_PATH.isEmpty();
        Assumptions.assumeTrue(pathPresent, "Skipping test because YUQUE_RESOURCE_PATH is not set");

        source = YuQueResource.builder().yuQueToken(YU_QUE_TOKEN).resourcePath(RESOURCE_PATH).build();
        TikaDocumentParser tikaParser = new TikaDocumentParser();
        reader = new YuQueDocumentReader(source, tikaParser);
    }

    /** Loads the configured document and prints the text of the first result. */
    @Test
    public void should_load_file() {
        // Nothing to do if set-up did not produce a reader.
        Assumptions.assumeTrue(reader != null, "Skipping test because reader is not initialized");

        List<Document> loaded = reader.get();
        Document first = loaded.get(0);
        System.out.println(first.getText());
    }

}
小结
spring-ai-alibaba-starter-document-reader-yuque提供了YuQueDocumentReader,它通过YuQueResource去请求资源,再通过DocumentParser(比如TikaDocumentParser)解析为Document,最后追加一个SOURCE的metadata。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。