序
本文主要研究一下Spring AI Alibaba的NotionDocumentReader
NotionDocumentReader
community/document-readers/spring-ai-alibaba-starter-document-reader-notion/src/main/java/com/alibaba/cloud/ai/reader/notion/NotionDocumentReader.java
/**
 * {@link DocumentReader} that converts one {@link NotionResource} into a single
 * {@link Document}: the resource's input stream becomes the document text and
 * selected fields of the Notion API response become the document metadata.
 */
public class NotionDocumentReader implements DocumentReader {

    private final NotionResource notionResource;

    private final JsonNode pageMetadata;

    /**
     * Constructor
     * @param notionResource Notion resource
     */
    public NotionDocumentReader(NotionResource notionResource) {
        this.notionResource = notionResource;
        this.pageMetadata = notionResource.getMetadata();
    }

    /**
     * Reads the resource and wraps it in a single-element list.
     * @return a list containing exactly one document
     * @throws RuntimeException if the resource stream cannot be read
     */
    @Override
    public List<Document> get() {
        try {
            // Read content from input stream
            String content = readContent();
            // Create metadata map
            Map<String, Object> metadata = buildMetadata();
            // Create and return document
            return Collections.singletonList(new Document(content, metadata));
        }
        catch (IOException e) {
            throw new RuntimeException("Failed to load document from Notion: " + e.getMessage(), e);
        }
    }

    /**
     * Build metadata map from Notion API response.
     * <p>
     * Every field is optional: a field that is absent, JSON {@code null} or blank
     * is simply left out of the map instead of causing a NullPointerException.
     * @return mutable metadata map, never {@code null}
     */
    private Map<String, Object> buildMetadata() {
        Map<String, Object> metadata = new HashMap<>();

        // Add basic metadata
        metadata.put(NotionResource.SOURCE, notionResource.getSource());
        metadata.put("resourceType", notionResource.getResourceType().name());
        metadata.put("resourceId", notionResource.getResourceId());

        if (pageMetadata == null) {
            return metadata;
        }

        // Creation and update times
        putEpochMillis(metadata, "createdTime", text(pageMetadata, "created_time"));
        putEpochMillis(metadata, "lastEditedTime", text(pageMetadata, "last_edited_time"));

        // Creator and last editor
        JsonNode createdBy = pageMetadata.path("created_by");
        putIfHasText(metadata, "createdBy", text(createdBy, "name"));
        putIfHasText(metadata, "createdById", text(createdBy, "id"));

        JsonNode lastEditedBy = pageMetadata.path("last_edited_by");
        putIfHasText(metadata, "lastEditedBy", text(lastEditedBy, "name"));
        putIfHasText(metadata, "lastEditedById", text(lastEditedBy, "id"));

        // URL
        putIfHasText(metadata, "url", text(pageMetadata, "url"));

        // Parent information: the id lives under a key derived from the parent type
        JsonNode parent = pageMetadata.path("parent");
        String parentType = text(parent, "type");
        if (StringUtils.hasText(parentType)) {
            metadata.put("parentType", parentType);
            putIfHasText(metadata, "parentId", text(parent, parentType + "_id"));
        }

        // Icon and cover
        putFileInfo(metadata, "icon", pageMetadata.path("icon"));
        putFileInfo(metadata, "cover", pageMetadata.path("cover"));

        return metadata;
    }

    /**
     * Stores {@code <prefix>Type} and {@code <prefix>Url} for an icon/cover node.
     * A top-level {@code url} field is preferred; when it is absent the URL is
     * looked up under the key named after the node's type.
     * NOTE(review): the nested lookup follows the Notion file-object layout
     * ({@code external.url} / {@code file.url}) - confirm against the API version in use.
     */
    private static void putFileInfo(Map<String, Object> metadata, String prefix, JsonNode fileNode) {
        String type = text(fileNode, "type");
        putIfHasText(metadata, prefix + "Type", type);

        String url = text(fileNode, "url");
        if (!StringUtils.hasText(url) && StringUtils.hasText(type)) {
            url = text(fileNode.path(type), "url");
        }
        putIfHasText(metadata, prefix + "Url", url);
    }

    /**
     * Returns the text of {@code node.field}, or {@code null} when the field is
     * missing or JSON null. {@code path()} is used instead of {@code get()} so
     * that missing intermediate nodes never yield a Java null.
     */
    private static String text(JsonNode node, String field) {
        JsonNode value = node.path(field);
        if (value.isMissingNode() || value.isNull()) {
            return null;
        }
        return value.asText();
    }

    /** Puts {@code value} under {@code key} only when it contains text. */
    private static void putIfHasText(Map<String, Object> metadata, String key, String value) {
        if (StringUtils.hasText(value)) {
            metadata.put(key, value);
        }
    }

    /** Parses an ISO-8601 instant and stores it as epoch milliseconds when present. */
    private static void putEpochMillis(Map<String, Object> metadata, String key, String isoInstant) {
        if (StringUtils.hasText(isoInstant)) {
            metadata.put(key, Instant.parse(isoInstant).toEpochMilli());
        }
    }

    /**
     * Read content from input stream. Each line is re-terminated with {@code \n}.
     * <p>
     * The reader intentionally uses the platform default charset: it has to stay
     * symmetric with however the resource produced its bytes.
     */
    private String readContent() throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(notionResource.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        }
        return content.toString();
    }

}
NotionDocumentReader实现了DocumentReader接口,其get方法先执行readContent获取content,再通过buildMetadata构建metadata;readContent方法从notionResource.getInputStream()读取数据;buildMetadata方法设置了notionResource.getSource()、notionResource.getResourceType()、notionResource.getResourceId(),对于pageMetadata不为null的提取created_time、last_edited_time、created_by的id、name、last_edited_by的id、name、url、parent、icon、cover信息
NotionResource
community/document-readers/spring-ai-alibaba-starter-document-reader-notion/src/main/java/com/alibaba/cloud/ai/reader/notion/NotionResource.java
/**
 * {@link Resource} backed by a Notion page or database.
 * <p>
 * All remote calls happen in the constructor: the token is validated, the
 * resource metadata is fetched, and the textual content is downloaded and kept
 * in memory as a byte stream.
 */
public class NotionResource implements Resource {

    /** Metadata key under which the resource source is published. */
    public static final String SOURCE = "source";

    private static final String BASE_URL = "https://api.notion.com/v1";

    private static final String API_VERSION = "2022-06-28";

    /** Kinds of Notion resources this class can load. */
    public enum ResourceType {

        PAGE, DATABASE

    }

    private final HttpClient httpClient;

    private final InputStream inputStream;

    private final URI uri;

    private final ResourceType resourceType;

    private final String resourceId;

    private JsonNode metadata;

    /**
     * Creates the resource and eagerly loads its metadata and content.
     * @param notionToken Notion API Token
     * @param resourceId Notion resource ID
     * @param resourceType Resource type (PAGE or DATABASE)
     */
    public NotionResource(String notionToken, String resourceId, ResourceType resourceType) {
        Assert.hasText(resourceId, "ResourceId must not be empty");
        Assert.notNull(resourceType, "ResourceType must not be null");

        this.resourceId = resourceId;
        this.resourceType = resourceType;
        this.httpClient = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();

        validateToken(notionToken);

        // Metadata is requested first, then the body that matches the resource type
        this.metadata = getResourceMetadata(notionToken, resourceId, resourceType);
        String body = switch (resourceType) {
            case PAGE -> getPageContent(notionToken, resourceId);
            case DATABASE -> getDatabaseContent(notionToken, resourceId);
        };

        // Bytes are produced with the platform default charset
        byte[] bytes = body.getBytes();
        this.inputStream = new ByteArrayInputStream(bytes);

        // Synthetic identifier of the form notion://<type>/<id>
        String typeSegment = resourceType.name().toLowerCase();
        this.uri = URI.create("notion://" + typeSegment + "/" + resourceId);
    }

    //......

}
NotionResource的构造器要求输入notionToken、resourceId、resourceType,它创建httpClient,并校验notionToken,之后通过getResourceMetadata获取metadata,再根据resourceType读取内容,若是PAGE执行getPageContent方法,若是DATABASE执行getDatabaseContent
getResourceMetadata
/**
 * Get resource metadata by requesting {@code /pages/{id}} or
 * {@code /databases/{id}} and parsing the JSON response body.
 * @param notionToken token sent as a Bearer credential
 * @param resourceId id appended to the endpoint path
 * @param resourceType selects the pages or databases endpoint
 * @return parsed JSON tree of the response body
 * @throws RuntimeException if the request fails, is interrupted, returns a
 * non-200 status, or the body cannot be parsed
 */
private JsonNode getResourceMetadata(String notionToken, String resourceId, ResourceType resourceType) {
    try {
        String endpoint = switch (resourceType) {
            case PAGE -> "/pages/";
            case DATABASE -> "/databases/";
        };

        URI uri = URI.create(BASE_URL + endpoint + resourceId);
        HttpRequest request = HttpRequest.newBuilder()
            .header("Authorization", "Bearer " + notionToken)
            .header("Notion-Version", API_VERSION)
            .uri(uri)
            .GET()
            .build();

        HttpResponse<String> response = this.httpClient.send(request, HttpResponse.BodyHandlers.ofString());
        Assert.isTrue(response.statusCode() == 200, "Failed to fetch resource metadata");

        ObjectMapper objectMapper = new ObjectMapper();
        return objectMapper.readTree(response.body());
    }
    catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe it
        Thread.currentThread().interrupt();
        throw new RuntimeException("Failed to get resource metadata", e);
    }
    catch (Exception e) {
        throw new RuntimeException("Failed to get resource metadata", e);
    }
}
getResourceMetadata方法通过请求/pages/resourceId或者/databases/resourceId来获取metadata
getPageContent
/**
 * Get page content as plain text: the page title (when present) followed by a
 * blank line, then one line per direct child block that carries rich text.
 * <p>
 * The block list is fetched page by page until the API reports no more
 * results, so long pages are not truncated to the first response. Only the
 * direct children of the page are requested; nested blocks are not traversed.
 * @param notionToken token sent as a Bearer credential
 * @param pageId id of the page to read
 * @return extracted text, possibly empty
 * @throws RuntimeException if a request fails, is interrupted, returns a
 * non-200 status, or a body cannot be parsed
 */
private String getPageContent(String notionToken, String pageId) {
    try {
        ObjectMapper objectMapper = new ObjectMapper();
        StringBuilder content = new StringBuilder();

        // 1. Get page content
        URI pageUri = URI.create(BASE_URL + "/pages/" + pageId);
        HttpRequest pageRequest = HttpRequest.newBuilder()
            .header("Authorization", "Bearer " + notionToken)
            .header("Notion-Version", API_VERSION)
            .uri(pageUri)
            .GET()
            .build();

        HttpResponse<String> pageResponse = this.httpClient.send(pageRequest, HttpResponse.BodyHandlers.ofString());
        Assert.isTrue(pageResponse.statusCode() == 200, "Failed to fetch page content");

        // 2. Parse page content and extract the title
        JsonNode pageJson = objectMapper.readTree(pageResponse.body());
        JsonNode titleArray = findPageTitle(pageJson.path("properties"));
        if (titleArray != null && titleArray.isArray()) {
            for (JsonNode titleNode : titleArray) {
                content.append(titleNode.path("plain_text").asText());
            }
            content.append("\n\n");
        }

        // 3. Get page blocks, following the pagination cursor
        String cursor = null;
        do {
            String query = (cursor == null) ? ""
                    : "?start_cursor="
                            + java.net.URLEncoder.encode(cursor, java.nio.charset.StandardCharsets.UTF_8);
            URI blocksUri = URI.create(BASE_URL + "/blocks/" + pageId + "/children" + query);
            HttpRequest blocksRequest = HttpRequest.newBuilder()
                .header("Authorization", "Bearer " + notionToken)
                .header("Notion-Version", API_VERSION)
                .uri(blocksUri)
                .GET()
                .build();

            HttpResponse<String> blocksResponse = this.httpClient.send(blocksRequest,
                    HttpResponse.BodyHandlers.ofString());
            Assert.isTrue(blocksResponse.statusCode() == 200, "Failed to fetch page blocks");

            // 4. Parse block content and 5. extract text
            JsonNode blocksJson = objectMapper.readTree(blocksResponse.body());
            appendBlockText(blocksJson.get("results"), content);

            JsonNode nextCursor = blocksJson.path("next_cursor");
            boolean hasMore = blocksJson.path("has_more").asBoolean()
                    && nextCursor.isTextual() && !nextCursor.asText().isEmpty();
            cursor = hasMore ? nextCursor.asText() : null;
        }
        while (cursor != null);

        return content.toString();
    }
    catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe it
        Thread.currentThread().interrupt();
        throw new RuntimeException("Failed to get page content", e);
    }
    catch (Exception e) {
        throw new RuntimeException("Failed to get page content", e);
    }
}

/**
 * Locates the rich-text array holding the page title. The property keyed
 * {@code title} is preferred; otherwise the first property whose
 * {@code type} is {@code title} is used (the key may be a custom column name).
 * @return the title array node, or {@code null} when none is found
 */
private static JsonNode findPageTitle(JsonNode properties) {
    JsonNode direct = properties.path("title").get("title");
    if (direct != null) {
        return direct;
    }
    for (JsonNode property : properties) {
        if ("title".equals(property.path("type").asText())) {
            return property.get("title");
        }
    }
    return null;
}

/**
 * Appends one line per block that has a {@code rich_text} field under the key
 * named after its own type; blocks without such a field are skipped.
 */
private static void appendBlockText(JsonNode blocks, StringBuilder content) {
    if (blocks == null || !blocks.isArray()) {
        return;
    }
    for (JsonNode block : blocks) {
        String type = block.path("type").asText();
        JsonNode richText = block.path(type).path("rich_text");
        if (richText.isMissingNode()) {
            continue;
        }
        for (JsonNode textNode : richText) {
            content.append(textNode.path("plain_text").asText());
        }
        content.append("\n");
    }
}
getPageContent方法先请求/pages/pageId获取页面对象,解析properties中的title并拼接其中各元素的plain_text作为标题;再请求/blocks/pageId/children获取block列表,遍历results中每个block的rich_text数组,提取其中的plain_text拼接成正文
getDatabaseContent
/**
 * Get database content as plain text. Every row contributes one
 * {@code "<property>: <text>"} line per array element that has a
 * {@code plain_text} field, followed by a {@code ---} separator line.
 * <p>
 * The query is repeated with the returned cursor until the API reports no more
 * results, so databases larger than one response page are read completely.
 * @param notionToken token sent as a Bearer credential
 * @param databaseId id of the database to query
 * @return extracted text, possibly empty
 * @throws RuntimeException if a request fails, is interrupted, returns a
 * non-200 status, or a body cannot be parsed
 */
private String getDatabaseContent(String notionToken, String databaseId) {
    try {
        ObjectMapper objectMapper = new ObjectMapper();
        URI uri = URI.create(BASE_URL + "/databases/" + databaseId + "/query");
        StringBuilder content = new StringBuilder();

        String cursor = null;
        do {
            // 1. Query database; the first request sends an empty filter object
            String requestBody = (cursor == null) ? "{}"
                    : objectMapper.writeValueAsString(objectMapper.createObjectNode().put("start_cursor", cursor));
            HttpRequest request = HttpRequest.newBuilder()
                .header("Authorization", "Bearer " + notionToken)
                .header("Notion-Version", API_VERSION)
                .header("Content-Type", "application/json")
                .uri(uri)
                .POST(HttpRequest.BodyPublishers.ofString(requestBody))
                .build();

            HttpResponse<String> response = this.httpClient.send(request, HttpResponse.BodyHandlers.ofString());
            Assert.isTrue(response.statusCode() == 200, "Failed to fetch database content");

            // 2. Parse database content
            JsonNode jsonResponse = objectMapper.readTree(response.body());
            JsonNode results = jsonResponse.get("results");

            // 3. Extract property values
            if (results != null && results.isArray()) {
                for (JsonNode row : results) {
                    appendRowProperties(row.path("properties"), content);
                    content.append("---\n");
                }
            }

            JsonNode nextCursor = jsonResponse.path("next_cursor");
            boolean hasMore = jsonResponse.path("has_more").asBoolean()
                    && nextCursor.isTextual() && !nextCursor.asText().isEmpty();
            cursor = hasMore ? nextCursor.asText() : null;
        }
        while (cursor != null);

        return content.toString();
    }
    catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe it
        Thread.currentThread().interrupt();
        throw new RuntimeException("Failed to get database content", e);
    }
    catch (Exception e) {
        throw new RuntimeException("Failed to get database content", e);
    }
}

/**
 * Appends the text of one row. Only properties whose value (stored under the
 * key named after the property's type) is an array are considered, and within
 * it only elements that have a {@code plain_text} field.
 */
private static void appendRowProperties(JsonNode properties, StringBuilder content) {
    for (Iterator<String> it = properties.fieldNames(); it.hasNext();) {
        String propertyName = it.next();
        JsonNode property = properties.get(propertyName);
        JsonNode value = property.path(property.path("type").asText());
        if (!value.isArray()) {
            continue;
        }
        for (JsonNode item : value) {
            if (item.has("plain_text")) {
                content.append(propertyName)
                    .append(": ")
                    .append(item.get("plain_text").asText())
                    .append("\n");
            }
        }
    }
}
getDatabaseContent通过POST请求/databases/databaseId/query来查询,之后遍历results中每一行的properties,对取值为数组的属性提取其中各元素的plain_text,按“属性名: 文本”的形式逐行输出,每条记录之后追加一行---作为分隔
示例
community/document-readers/spring-ai-alibaba-starter-document-reader-notion/src/test/java/com/alibaba/cloud/ai/reader/notion/NotionDocumentReaderIT.java
/**
 * Integration test for {@link NotionDocumentReader}. The whole class is enabled
 * only when the {@code NOTION_TOKEN} environment variable is non-empty.
 */
@EnabledIfEnvironmentVariable(named = "NOTION_TOKEN", matches = ".+")
class NotionDocumentReaderIT {

    private static final String NOTION_TOKEN = System.getenv("NOTION_TOKEN");

    // Test page ID
    private static final String TEST_PAGE_ID = "${pageId}";

    // Test database ID
    private static final String TEST_DATABASE_ID = "${databaseId}";

    // Static initializer to log a message if environment variable is not set
    static {
        boolean tokenMissing = NOTION_TOKEN == null || NOTION_TOKEN.isEmpty();
        if (tokenMissing) {
            System.out.println("Skipping Notion tests because NOTION_TOKEN environment variable is not set.");
        }
    }

    NotionDocumentReader pageReader;

    NotionDocumentReader databaseReader;

    @BeforeEach
    public void beforeEach() {
        // Readers stay null when no token is available
        if (NOTION_TOKEN == null || NOTION_TOKEN.isEmpty()) {
            return;
        }
        pageReader = newReader(TEST_PAGE_ID, NotionResource.ResourceType.PAGE);
        databaseReader = newReader(TEST_DATABASE_ID, NotionResource.ResourceType.DATABASE);
    }

    /** Builds a resource for the given id and type and wraps it in a reader. */
    private static NotionDocumentReader newReader(String resourceId, NotionResource.ResourceType resourceType) {
        NotionResource resource = NotionResource.builder()
            .notionToken(NOTION_TOKEN)
            .resourceId(resourceId)
            .resourceType(resourceType)
            .build();
        return new NotionDocumentReader(resource);
    }

    /** Checks the source, resourceType and resourceId metadata entries. */
    private static void assertBasicMetadata(Document document, String expectedSource,
            NotionResource.ResourceType expectedType, String expectedId) {
        assertThat(document.getMetadata()).containsKey(NotionResource.SOURCE);
        assertThat(document.getMetadata().get(NotionResource.SOURCE)).isEqualTo(expectedSource);
        assertThat(document.getMetadata().get("resourceType")).isEqualTo(expectedType.name());
        assertThat(document.getMetadata().get("resourceId")).isEqualTo(expectedId);
    }

    @Test
    void should_load_page() {
        // Skip test if pageReader is null
        Assumptions.assumeTrue(pageReader != null,
                "Skipping test because NotionDocumentReader could not be initialized");

        // when
        List<Document> loaded = pageReader.get();

        // then
        assertThat(loaded).isNotEmpty();
        Document first = loaded.get(0);

        // Verify metadata
        assertBasicMetadata(first, "notion://page/" + TEST_PAGE_ID, NotionResource.ResourceType.PAGE,
                TEST_PAGE_ID);

        // Verify content
        String text = first.getText();
        assertThat(text).isNotEmpty();
        System.out.println("Page content: " + text);
    }

    @Test
    void should_load_database() {
        // Skip test if databaseReader is null
        Assumptions.assumeTrue(databaseReader != null,
                "Skipping test because NotionDocumentReader could not be initialized");

        // when
        List<Document> loaded = databaseReader.get();

        // then
        assertThat(loaded).isNotEmpty();
        Document first = loaded.get(0);

        // Verify metadata
        assertBasicMetadata(first, "notion://database/" + TEST_DATABASE_ID,
                NotionResource.ResourceType.DATABASE, TEST_DATABASE_ID);

        // Verify content
        String text = first.getText();
        assertThat(text).isNotEmpty();
        System.out.println("Database content: " + text);
    }

}
小结
spring-ai-alibaba-starter-document-reader-notion提供了NotionDocumentReader用于读取notion文档,其readContent方法从notionResource.getInputStream()读取数据;NotionResource的构造器要求输入notionToken、resourceId、resourceType,它创建httpClient,并校验notionToken,之后通过getResourceMetadata获取metadata,再根据resourceType读取内容,若是PAGE执行getPageContent方法,若是DATABASE执行getDatabaseContent。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。