序
本文主要研究一下Spring AI Alibaba的OneNoteDocumentReader
OneNoteDocumentReader
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReader.java
/**
 * {@link DocumentReader} that reads a OneNote notebook, section or page through the
 * Microsoft Graph REST API and returns one {@link Document} per page.
 *
 * <p>
 * The text of each document is the page title followed by a newline and the page's plain
 * text as extracted by Jsoup. All documents produced by a single {@link #get()} call
 * share the metadata fetched for the configured resource.
 */
public class OneNoteDocumentReader implements DocumentReader {

    public static final String MICROSOFT_GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0";

    public static final String NOTEBOOK_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id";

    public static final String SECTION_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id";

    private static final Logger log = LoggerFactory.getLogger(OneNoteDocumentReader.class);

    private final OneNoteResource oneNoteResource;

    private final HttpClient client;

    private final String accessToken;

    /**
     * Creates a reader for the given resource.
     * @param accessToken value sent verbatim as the {@code Authorization} header of every
     * request
     * @param oneNoteResource the notebook, section or page to read
     */
    public OneNoteDocumentReader(String accessToken, OneNoteResource oneNoteResource) {
        this.accessToken = accessToken;
        this.oneNoteResource = oneNoteResource;
        this.client = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();
    }

    /**
     * Retrieves the content of every page of a OneNote notebook.
     * @param accessToken value of the {@code Authorization} header
     * @param notebookId id of the notebook whose pages are read
     * @return the parsed text of each page, in the order the API listed them
     */
    private List<String> getNoteBookContent(String accessToken, String notebookId) {
        return getContentByParentFilter(accessToken, NOTEBOOK_ID_FILTER_PREFIX, notebookId);
    }

    /**
     * Retrieves the content of every page of a OneNote section.
     * @param accessToken value of the {@code Authorization} header
     * @param sectionId id of the section whose pages are read
     * @return the parsed text of each page, in the order the API listed them
     */
    private List<String> getSectionContent(String accessToken, String sectionId) {
        return getContentByParentFilter(accessToken, SECTION_ID_FILTER_PREFIX, sectionId);
    }

    /**
     * Lists the pages whose parent matches {@code parentId} and fetches each page's
     * content. Shared by the notebook and section variants, which differ only in the
     * filter prefix.
     */
    private List<String> getContentByParentFilter(String accessToken, String filterPrefix, String parentId) {
        // Resulting query: <prefix>+eq+'<parentId>'
        String uri = MICROSOFT_GRAPH_BASE_URL + filterPrefix + "+eq+" + "'" + parentId + "'";
        List<String> pageIds = getOneNotePageIdsByURI(accessToken, uri);
        return pageIds.stream().map(id -> getPageContent(accessToken, id)).toList();
    }

    /**
     * Issues a GET against {@code uri} and returns the page ids found in the response.
     * @throws RuntimeException if the request fails, the status is not 200 or the body
     * cannot be parsed
     */
    private List<String> getOneNotePageIdsByURI(String accessToken, String uri) {
        HttpRequest request = HttpRequest.newBuilder()
            .header("Authorization", accessToken)
            .header("Content-Type", "application/json")
            .uri(URI.create(uri))
            .GET()
            .build();
        try {
            HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
            Assert.isTrue(response.statusCode() == 200, "Failed to fetch pages information");
            // NOTE(review): only this single response is read; if the API returns a
            // continuation link for long listings it is not followed - confirm whether
            // paging is required.
            return parsePageIdsFromJson(response.body());
        }
        catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Failed to get pages id", e);
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to get pages id", e);
        }
    }

    /**
     * Extracts the {@code id} of every element of the top-level {@code value} array.
     */
    private List<String> parsePageIdsFromJson(String jsonResponse) {
        JsonObject rootObject = JsonParser.parseString(jsonResponse).getAsJsonObject();
        JsonArray valueArray = rootObject.getAsJsonArray("value");
        return valueArray.asList()
            .stream()
            .map(jsonElement -> jsonElement.getAsJsonObject().get("id").getAsString())
            .toList();
    }

    /**
     * Retrieves the HTML content of a single page and converts it to plain text.
     * @param accessToken value of the {@code Authorization} header
     * @param pageId id of the page to fetch
     * @return the page title, a newline, then the page text
     * @throws RuntimeException if the request fails or the status is not 200
     */
    private String getPageContent(String accessToken, String pageId) {
        URI uri = URI.create(MICROSOFT_GRAPH_BASE_URL + "/me/onenote/pages/" + pageId + "/content");
        HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).uri(uri).GET().build();
        try {
            HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
            Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");
            return parseHtmlContent(response.body());
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.warn("Failed to get page content, pageId: {}, {}", pageId, e.getMessage(), e);
            throw new RuntimeException("Failed to get page content", e);
        }
        catch (Exception e) {
            // The access token is a credential and is deliberately kept out of the log.
            log.warn("Failed to get page content, pageId: {}, {}", pageId, e.getMessage(), e);
            throw new RuntimeException("Failed to get page content", e);
        }
    }

    /**
     * Reads the configured resource and returns one document per page.
     * @return the documents, each carrying the same resource-level metadata
     * @throws IllegalArgumentException if the token, resource type or resource id is
     * {@code null}
     * @throws RuntimeException if any request to the API fails
     */
    @Override
    public List<Document> get() {
        String accessToken = this.accessToken;
        OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();
        String resourceId = this.oneNoteResource.getResourceId();

        Assert.notNull(accessToken, "token must not be null");
        Assert.notNull(resourceType, "resource type must not be null");
        Assert.notNull(resourceId, "resource id must not be null");

        // Notebooks and sections expand to all of their pages; a page is read directly.
        List<String> content = switch (resourceType) {
            case NOTEBOOK -> getNoteBookContent(accessToken, resourceId);
            case SECTION -> getSectionContent(accessToken, resourceId);
            case PAGE -> Collections.singletonList(getPageContent(accessToken, resourceId));
        };

        Map<String, Object> metaData = buildMetadata();
        return content.stream().map(c -> new Document(c, metaData)).toList();
    }

    /**
     * Converts a page's HTML into {@code title + "\n" + text}; a blank title becomes an
     * empty string.
     */
    private String parseHtmlContent(String htmlContent) {
        org.jsoup.nodes.Document parseDoc = Jsoup.parse(htmlContent);
        String title = parseDoc.title();
        String text = parseDoc.text();
        return (StringUtils.hasText(title) ? title : "") + "\n" + text;
    }

    /**
     * Builds metadata for the configured resource (notebook, section or page) by querying
     * the Microsoft Graph API.
     *
     * <p>
     * The map always contains the resource URI, type and id; {@code createdTime},
     * {@code lastModifiedTime} (epoch milliseconds) and {@code contentURL} are added only
     * when the response carries a non-blank value for them.
     * @return a mutable metadata map
     * @throws RuntimeException if the request fails, the status is not 200 or the body
     * cannot be parsed
     */
    private Map<String, Object> buildMetadata() {
        Map<String, Object> metadata = new HashMap<>();
        String accessToken = this.accessToken;
        String resourceId = this.oneNoteResource.getResourceId();
        OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();

        String endpoint = switch (resourceType) {
            case NOTEBOOK -> "/notebooks/";
            case SECTION -> "/sections/";
            case PAGE -> "/pages/";
        };
        String uriPath = MICROSOFT_GRAPH_BASE_URL + "/me/onenote" + endpoint + resourceId;
        URI uri = URI.create(uriPath);

        metadata.put(OneNoteResource.SOURCE, uriPath);
        metadata.put("resourceType", resourceType.name());
        metadata.put("resourceId", resourceId);

        try {
            HttpRequest request = HttpRequest.newBuilder()
                .header("Authorization", accessToken)
                .header("Content-Type", "application/json")
                .uri(uri)
                .GET()
                .build();
            HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
            Assert.isTrue(response.statusCode() == 200, "Failed to fetch resource metadata");

            JsonObject jsonMetaData = JsonParser.parseString(response.body()).getAsJsonObject();

            String createDateTime = readString(jsonMetaData, "createdDateTime");
            if (StringUtils.hasText(createDateTime)) {
                metadata.put("createdTime", Instant.parse(createDateTime).toEpochMilli());
            }

            String lastModifiedDateTime = readString(jsonMetaData, "lastModifiedDateTime");
            if (StringUtils.hasText(lastModifiedDateTime)) {
                metadata.put("lastModifiedTime", Instant.parse(lastModifiedDateTime).toEpochMilli());
            }

            String contentURL = readString(jsonMetaData, "contentUrl");
            if (StringUtils.hasText(contentURL)) {
                metadata.put("contentURL", contentURL);
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.warn("Failed to get resource metadata, resourceId: {}, resourceType: {}, {}", resourceId,
                    resourceType, e.getMessage(), e);
            throw new RuntimeException("Failed to get resource metadata", e);
        }
        catch (Exception e) {
            // The access token is a credential and is deliberately kept out of the log.
            log.warn("Failed to get resource metadata, resourceId: {}, resourceType: {}, {}", resourceId,
                    resourceType, e.getMessage(), e);
            throw new RuntimeException("Failed to get resource metadata", e);
        }
        return metadata;
    }

    /**
     * Returns the string value of {@code field}, or {@code null} when the field is absent
     * or is a JSON null.
     */
    private static String readString(JsonObject json, String field) {
        return Optional.ofNullable(json.get(field))
            .filter(element -> !element.isJsonNull())
            .map(JsonElement::getAsString)
            .orElse(null);
    }

}
OneNoteDocumentReader构造器要求输入accessToken及oneNoteResource,它会构建HttpClient,其get方法根据resourceType执行不同的逻辑,NOTEBOOK执行getNoteBookContent,SECTION执行getSectionContent,PAGE执行getPageContent,之后通过buildMetadata构建metaData,最后构建Document返回
getNoteBookContent请求`https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id+eq+'notebookId'`,提取pageIds,然后通过getPageContent获取内容
getSectionContent请求`https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id+eq+'sectionId'`,提取pageIds,然后通过getPageContent获取内容
getPageContent请求`https://graph.microsoft.com/v1.0/me/onenote/pages/pageId/content`,获取html结果,再通过jsoup解析title、text,最后通过`\n`拼接返回
buildMetadata方法根据不同的resourceType构建不同的请求uri,请求之后提取createdTime、lastModifiedTime、contentURL
OneNoteResource
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteResource.java
/**
 * Identifies a OneNote resource by its id and its {@link ResourceType}.
 */
public class OneNoteResource implements Resource {

    /** Key under which the reader stores the resource URI in document metadata. */
    public static final String SOURCE = "source";

    /** The kinds of OneNote resource that can be addressed. */
    public enum ResourceType {

        NOTEBOOK, SECTION, PAGE

    }

    private final ResourceType resourceType;

    private final String resourceId;

    /**
     * Creates a resource descriptor.
     * @param resourceId id of the resource; must contain text
     * @param resourceType kind of the resource; must not be {@code null}
     */
    public OneNoteResource(String resourceId, ResourceType resourceType) {
        // The id is validated first, then the type, each with its own message.
        Assert.hasText(resourceId, "ResourceId must not be empty");
        Assert.notNull(resourceType, "ResourceType must not be null");
        this.resourceType = resourceType;
        this.resourceId = resourceId;
    }

    /**
     * @return the kind of resource this descriptor points at
     */
    public ResourceType getResourceType() {
        return this.resourceType;
    }

    /**
     * @return the id of the resource
     */
    public String getResourceId() {
        return this.resourceId;
    }

    //......

}
OneNoteResource主要是定义了NOTEBOOK, SECTION, PAGE这三种resourceType以及resourceId
示例
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/test/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReaderTest.java
/**
 * Integration tests for {@link OneNoteDocumentReader}. They call the live API and are
 * enabled only when the {@code ONENOTE_ACCESS_TOKEN} environment variable is set.
 */
@EnabledIfEnvironmentVariable(named = "ONENOTE_ACCESS_TOKEN", matches = ".+")
public class OneNoteDocumentReaderTest {

    private static final String TEST_ACCESS_TOKEN = System.getenv("ONENOTE_ACCESS_TOKEN");

    // Placeholders: replace with real ids before running against an account.
    private static final String TEST_NOTEBOOK_ID = "${notebookId}";

    private static final String TEST_SECTION_ID = "${sectionId}";

    private static final String TEST_PAGE_ID = "${pageId}";

    static {
        if (TEST_ACCESS_TOKEN == null || TEST_ACCESS_TOKEN.isEmpty()) {
            System.out.println("ONENOTE_ACCESS_TOKEN environment variable is not set. Tests will be skipped.");
        }
    }

    @Test
    public void test_load_page() {
        assertResourceLoads(TEST_PAGE_ID, OneNoteResource.ResourceType.PAGE);
    }

    @Test
    public void test_load_section() {
        assertResourceLoads(TEST_SECTION_ID, OneNoteResource.ResourceType.SECTION);
    }

    @Test
    public void test_load_notebook() {
        assertResourceLoads(TEST_NOTEBOOK_ID, OneNoteResource.ResourceType.NOTEBOOK);
    }

    /**
     * Reads the given resource and verifies that at least one document comes back with
     * the expected metadata and non-empty text. Skips the calling test when no access
     * token is available.
     */
    private static void assertResourceLoads(String resourceId, OneNoteResource.ResourceType resourceType) {
        // Ensure TEST_ACCESS_TOKEN is set, otherwise skip the test
        Assumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),
                "Skipping test because ONENOTE_ACCESS_TOKEN is not set");

        // Create a reader for the requested resource
        OneNoteResource oneNoteResource = OneNoteResource.builder()
            .resourceId(resourceId)
            .resourceType(resourceType)
            .build();
        OneNoteDocumentReader reader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);

        List<Document> documents = reader.get();

        // then
        assertThat(documents).isNotEmpty();
        Document document = documents.get(0);

        // Verify metadata
        assertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);
        assertThat(document.getMetadata().get("resourceType")).isEqualTo(resourceType.name());
        assertThat(document.getMetadata().get("resourceId")).isEqualTo(resourceId);

        // Verify content
        String content = document.getText();
        assertThat(content).isNotEmpty();
    }

}
小结
spring-ai-alibaba-starter-document-reader-onenote提供了OneNoteDocumentReader用于根据accessToken、resourceId、resourceType去获取oneNote的内容及meta。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用。你还可以使用@来通知其他用户。