本文主要研究一下Spring AI Alibaba的OneNoteDocumentReader

OneNoteDocumentReader

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReader.java

/**
 * {@link DocumentReader} that loads the text of OneNote pages through the Microsoft Graph
 * REST API.
 *
 * <p>
 * Depending on the {@link OneNoteResource.ResourceType} of the configured resource, the
 * reader fetches a single page, every page of a section, or every page of a notebook.
 * Each page becomes one {@link Document} whose text is the page title, a newline, and the
 * page's plain text. All documents produced by one {@link #get()} call carry the same
 * metadata, which describes the configured resource (not the individual page).
 *
 * <p>
 * The access token is sent verbatim as the value of the {@code Authorization} header, so
 * it presumably has to include the scheme (for example {@code "Bearer ..."}) - confirm
 * against callers. The token is never written to the log.
 */
public class OneNoteDocumentReader implements DocumentReader {

  public static final String MICROSOFT_GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0";

  public static final String NOTEBOOK_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id";

  public static final String SECTION_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id";

  /** JSON property through which Microsoft Graph links to the next page of a result set. */
  private static final String NEXT_LINK_FIELD = "@odata.nextLink";

  private static final Logger log = LoggerFactory.getLogger(OneNoteDocumentReader.class);

  private final OneNoteResource oneNoteResource;

  private final HttpClient client;

  private final String accessToken;

  /**
   * Creates a reader for one OneNote resource.
   * @param accessToken value sent as the {@code Authorization} header of every request
   * @param oneNoteResource the notebook, section, or page to read
   */
  public OneNoteDocumentReader(String accessToken, OneNoteResource oneNoteResource) {
    this.accessToken = accessToken;
    this.oneNoteResource = oneNoteResource;
    this.client = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();
  }

  /**
   * Retrieves the content of every page of a OneNote notebook.
   * @param accessToken value of the {@code Authorization} header
   * @param notebookId ID of the notebook whose pages are read
   * @return one "title + newline + text" string per page
   */
  private List<String> getNoteBookContent(String accessToken, String notebookId) {
    return getPagesContentByParent(accessToken, NOTEBOOK_ID_FILTER_PREFIX, notebookId);
  }

  /**
   * Retrieves the content of every page of a OneNote section.
   * @param accessToken value of the {@code Authorization} header
   * @param sectionId ID of the section whose pages are read
   * @return one "title + newline + text" string per page
   */
  private List<String> getSectionContent(String accessToken, String sectionId) {
    return getPagesContentByParent(accessToken, SECTION_ID_FILTER_PREFIX, sectionId);
  }

  /**
   * Shared implementation for notebooks and sections: lists the page IDs that match the
   * parent filter, then downloads each page.
   * @param accessToken value of the {@code Authorization} header
   * @param filterPrefix one of the {@code *_ID_FILTER_PREFIX} constants
   * @param parentId ID of the parent notebook or section
   * @return one "title + newline + text" string per page
   */
  private List<String> getPagesContentByParent(String accessToken, String filterPrefix, String parentId) {
    // Resulting query: <prefix>+eq+'<parentId>'
    String uri = MICROSOFT_GRAPH_BASE_URL + filterPrefix + "+eq+" + "'" + parentId + "'";
    List<String> pageIds = getOneNotePageIdsByURI(accessToken, uri);
    return pageIds.stream().map(id -> getPageContent(accessToken, id)).toList();
  }

  /**
   * Collects the IDs of all pages returned by a page-listing query. The listing is
   * paginated by the server, so every {@code @odata.nextLink} is followed until the
   * result set is exhausted; otherwise large notebooks or sections would be silently
   * truncated to the first result page.
   * @param accessToken value of the {@code Authorization} header
   * @param uri the initial listing URI
   * @return the page IDs in the order the API returned them
   * @throws RuntimeException if a request fails, returns a non-200 status, or the
   * response cannot be parsed
   */
  private List<String> getOneNotePageIdsByURI(String accessToken, String uri) {
    // Fully qualified on purpose: avoids requiring a new import in the file.
    List<String> pageIds = new java.util.ArrayList<>();
    String nextUri = uri;
    try {
      while (nextUri != null) {
        HttpRequest request = HttpRequest.newBuilder()
          .header("Authorization", accessToken)
          .header("Content-Type", "application/json")
          .uri(URI.create(nextUri))
          .GET()
          .build();

        HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
        Assert.isTrue(response.statusCode() == 200, "Failed to fetch pages information");

        JsonObject rootObject = JsonParser.parseString(response.body()).getAsJsonObject();
        pageIds.addAll(parsePageIdsFromJson(rootObject));

        JsonElement nextLink = rootObject.get(NEXT_LINK_FIELD);
        nextUri = (nextLink != null && !nextLink.isJsonNull()) ? nextLink.getAsString() : null;
      }
    }
    catch (Exception e) {
      restoreInterruptIfNeeded(e);
      throw new RuntimeException("Failed to get pages id", e);
    }
    return List.copyOf(pageIds);
  }

  /**
   * Extracts the {@code id} of every entry of the {@code value} array of a page-listing
   * response.
   * @param rootObject the parsed response body
   * @return the page IDs in response order
   */
  private List<String> parsePageIdsFromJson(JsonObject rootObject) {
    JsonArray valueArray = rootObject.getAsJsonArray("value");

    return valueArray.asList()
      .stream()
      .map(jsonElement -> jsonElement.getAsJsonObject().get("id").getAsString())
      .toList();
  }

  /**
   * Downloads the HTML of one OneNote page and reduces it to plain text.
   * @param accessToken value of the {@code Authorization} header
   * @param pageId ID of the page to download
   * @return the page title, a newline, and the page text
   * @throws RuntimeException if the request fails or returns a non-200 status
   */
  private String getPageContent(String accessToken, String pageId) {
    URI uri = URI.create(MICROSOFT_GRAPH_BASE_URL + "/me/onenote/pages/" + pageId + "/content");
    HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).uri(uri).GET().build();
    try {
      HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
      Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");
      return parseHtmlContent(response.body());
    }
    catch (Exception e) {
      restoreInterruptIfNeeded(e);
      // The access token is a credential and is deliberately kept out of the log.
      log.warn("Failed to get page content, pageId: {}, {}", pageId, e.getMessage(), e);
      throw new RuntimeException("Failed to get page content", e);
    }
  }

  /**
   * Reads the configured OneNote resource.
   * @return one {@link Document} per page; all documents share the metadata of the
   * configured resource
   * @throws IllegalArgumentException if the token, resource type, or resource ID is null
   * @throws RuntimeException if any Microsoft Graph request fails
   */
  @Override
  public List<Document> get() {
    String accessToken = this.accessToken;
    OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();
    String resourceId = this.oneNoteResource.getResourceId();

    // Parameters check
    Assert.notNull(accessToken, "token must not be null");
    Assert.notNull(resourceType, "resource type must not be null");
    Assert.notNull(resourceId, "resource id must not be null");

    // Fetch content based on the resource type (Notebook, Section, or Page)
    List<String> content = switch (resourceType) {
      case NOTEBOOK -> getNoteBookContent(accessToken, resourceId);
      case SECTION -> getSectionContent(accessToken, resourceId);
      case PAGE -> Collections.singletonList(getPageContent(accessToken, resourceId));
    };

    // Metadata describes the requested resource and is fetched once for all pages
    Map<String, Object> metaData = buildMetadata();

    return content.stream().map(c -> new Document(c, metaData)).toList();
  }

  /**
   * Converts a page's HTML into "title + newline + text".
   * @param htmlContent the raw HTML of a page
   * @return the title (empty when the page has none), a newline, and the document text
   */
  private String parseHtmlContent(String htmlContent) {
    org.jsoup.nodes.Document parseDoc = Jsoup.parse(htmlContent);

    String title = parseDoc.title();
    String text = parseDoc.text();

    return (StringUtils.hasText(title) ? title : "") + "\n" + text;
  }

  /**
   * Builds the metadata of the configured resource. The map always contains the resource
   * URI (under {@link OneNoteResource#SOURCE}), {@code resourceType} and
   * {@code resourceId}; {@code createdTime}, {@code lastModifiedTime} (both epoch
   * milliseconds) and {@code contentURL} are added when the API response provides them.
   * @return a mutable metadata map
   * @throws RuntimeException if the request fails, returns a non-200 status, or the
   * response cannot be parsed
   */
  private Map<String, Object> buildMetadata() {
    Map<String, Object> metadata = new HashMap<>();
    String accessToken = this.accessToken;
    String resourceId = this.oneNoteResource.getResourceId();
    OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();
    String endpoint = switch (resourceType) {
      case NOTEBOOK -> "/notebooks/";
      case SECTION -> "/sections/";
      case PAGE -> "/pages/";
    };
    String uriPath = MICROSOFT_GRAPH_BASE_URL + "/me/onenote" + endpoint + resourceId;
    URI uri = URI.create(uriPath);

    // Basic metadata that does not depend on the API response
    metadata.put(OneNoteResource.SOURCE, uriPath);
    metadata.put("resourceType", resourceType.name());
    metadata.put("resourceId", resourceId);

    try {
      HttpRequest request = HttpRequest.newBuilder()
        .header("Authorization", accessToken)
        .header("Content-Type", "application/json")
        .uri(uri)
        .GET()
        .build();

      HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());
      Assert.isTrue(response.statusCode() == 200, "Failed to fetch resource metadata");

      JsonObject jsonMetaData = JsonParser.parseString(response.body()).getAsJsonObject();

      String createDateTime = getStringField(jsonMetaData, "createdDateTime");
      if (StringUtils.hasText(createDateTime)) {
        metadata.put("createdTime", Instant.parse(createDateTime).toEpochMilli());
      }

      String lastModifiedDateTime = getStringField(jsonMetaData, "lastModifiedDateTime");
      if (StringUtils.hasText(lastModifiedDateTime)) {
        metadata.put("lastModifiedTime", Instant.parse(lastModifiedDateTime).toEpochMilli());
      }

      String contentURL = getStringField(jsonMetaData, "contentUrl");
      if (StringUtils.hasText(contentURL)) {
        metadata.put("contentURL", contentURL);
      }

    }
    catch (Exception e) {
      restoreInterruptIfNeeded(e);
      // The access token is a credential and is deliberately kept out of the log.
      log.warn("Failed to get resource metadata, resourceId: {}, resourceType: {}, {}", resourceId,
          resourceType, e.getMessage(), e);
      throw new RuntimeException("Failed to get resource metadata", e);
    }
    return metadata;
  }

  /**
   * Reads an optional string property, treating both a missing property and an explicit
   * JSON {@code null} as absent.
   * @param json the object to read from
   * @param field the property name
   * @return the property value, or {@code null} when absent
   */
  private static String getStringField(JsonObject json, String field) {
    return Optional.ofNullable(json.get(field))
      .filter(element -> !element.isJsonNull())
      .map(JsonElement::getAsString)
      .orElse(null);
  }

  /**
   * Re-asserts the thread's interrupt flag when a blocking HTTP call was interrupted, so
   * that wrapping the exception does not hide the interruption from callers.
   * @param e the exception caught around {@code HttpClient.send}
   */
  private static void restoreInterruptIfNeeded(Exception e) {
    if (e instanceof InterruptedException) {
      Thread.currentThread().interrupt();
    }
  }

}
OneNoteDocumentReader构造器要求输入accessToken及oneNoteResource,它会构建HttpClient,其get方法根据resourceType执行不同的逻辑,NOTEBOOK执行getNoteBookContent,SECTION执行getSectionContent,PAGE执行getPageContent,之后通过buildMetadata构建metaData,最后构建Document返回
getNoteBookContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id+eq+'notebookId'提取pageIds,然后通过getPageContent获取内容
getSectionContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id+eq+'sectionId'提取pageIds,然后通过getPageContent获取内容
getPageContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/pageId/content获取html结果,再通过jsoup解析title、text,最后通过\n拼接返回
buildMetadata方法根据不同的resourceType构建不同的请求uri,请求之后提取createdTime、lastModifiedTime、contentURL

OneNoteResource

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteResource.java

/**
 * Identifies a OneNote resource by its ID and its kind (notebook, section, or page).
 * Only part of the class is shown here; the remaining members are elided below.
 */
public class OneNoteResource implements Resource {

  // Name of the "source" entry; used as a metadata key by code that reads this resource.
  public static final String SOURCE = "source";

  /** The kinds of OneNote resource that can be addressed. */
  public enum ResourceType {

    NOTEBOOK, SECTION, PAGE

  }

  // Kind of the resource; never null once constructed (checked in the constructor).
  private final ResourceType resourceType;

  // ID of the resource; never empty once constructed (checked in the constructor).
  private final String resourceId;

  /**
   * @return the kind of this resource
   */
  public ResourceType getResourceType() {
    return resourceType;
  }

  /**
   * @return the ID of this resource
   */
  public String getResourceId() {
    return resourceId;
  }

  /**
   * Creates a resource descriptor, rejecting incomplete input up front.
   * @param resourceId ID of the resource; must contain text
   * @param resourceType kind of the resource; must not be null
   */
  public OneNoteResource(String resourceId, ResourceType resourceType) {
    Assert.hasText(resourceId, "ResourceId must not be empty");
    Assert.notNull(resourceType, "ResourceType must not be null");

    this.resourceId = resourceId;
    this.resourceType = resourceType;
  }

  //......
}  
OneNoteResource主要是定义了NOTEBOOK, SECTION, PAGE这三种resourceType以及resourceId

示例

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/test/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReaderTest.java

/**
 * Integration tests for {@link OneNoteDocumentReader}. They call the live API and
 * therefore only run when the {@code ONENOTE_ACCESS_TOKEN} environment variable is set.
 * The {@code ${...}} resource IDs are placeholders that have to be replaced with real IDs
 * before the tests can pass.
 */
@EnabledIfEnvironmentVariable(named = "ONENOTE_ACCESS_TOKEN", matches = ".+")
public class OneNoteDocumentReaderTest {

  private static final String TEST_ACCESS_TOKEN = System.getenv("ONENOTE_ACCESS_TOKEN");

  private static final String TEST_NOTEBOOK_ID = "${notebookId}";

  private static final String TEST_SECTION_ID = "${sectionId}";

  private static final String TEST_PAGE_ID = "${pageId}";

  static {
    if (TEST_ACCESS_TOKEN == null || TEST_ACCESS_TOKEN.isEmpty()) {
      System.out.println("ONENOTE_ACCESS_TOKEN environment variable is not set. Tests will be skipped.");
    }
  }

  @Test
  public void test_load_page() {
    assertResourceLoads(TEST_PAGE_ID, OneNoteResource.ResourceType.PAGE);
  }

  @Test
  public void test_load_section() {
    assertResourceLoads(TEST_SECTION_ID, OneNoteResource.ResourceType.SECTION);
  }

  @Test
  public void test_load_notebook() {
    assertResourceLoads(TEST_NOTEBOOK_ID, OneNoteResource.ResourceType.NOTEBOOK);
  }

  /**
   * Reads the given resource and verifies that at least one document with non-empty text
   * and the expected resource metadata is returned. The test is skipped (not failed)
   * when no access token is available.
   * @param resourceId ID of the notebook, section, or page to read
   * @param resourceType kind of the resource identified by {@code resourceId}
   */
  private static void assertResourceLoads(String resourceId, OneNoteResource.ResourceType resourceType) {
    // Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the test
    Assumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),
        "Skipping test because ONENOTE_ACCESS_TOKEN is not set");

    // Create a reader for the requested resource
    OneNoteResource oneNoteResource = OneNoteResource.builder()
      .resourceId(resourceId)
      .resourceType(resourceType)
      .build();
    OneNoteDocumentReader reader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);

    List<Document> documents = reader.get();
    assertThat(documents).isNotEmpty();
    Document document = documents.get(0);

    // Verify metadata
    assertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);
    assertThat(document.getMetadata().get("resourceType")).isEqualTo(resourceType.name());
    assertThat(document.getMetadata().get("resourceId")).isEqualTo(resourceId);

    // Verify content
    String content = document.getText();
    assertThat(content).isNotEmpty();
  }

}

小结

spring-ai-alibaba-starter-document-reader-onenote提供了OneNoteDocumentReader,用于根据accessToken、resourceId、resourceType获取OneNote的内容及元数据(metadata)。

doc


codecraft
11.9k 声望2k 粉丝

当一个代码的工匠回首往事时,不因虚度年华而悔恨,也不因碌碌无为而羞愧,这样,当他老的时候,可以很自豪告诉世人,我曾经将代码注入生命去打造互联网的浪潮之巅,那是个很疯狂的时代,我在一波波的浪潮上留下...