序
本文主要研究一下Spring AI Alibaba的ObsidianDocumentReader
ObsidianDocumentReader
community/document-readers/spring-ai-alibaba-starter-document-reader-obsidian/src/main/java/com/alibaba/cloud/ai/reader/obsidian/ObsidianDocumentReader.java
/**
 * {@link DocumentReader} that loads the markdown files of an Obsidian vault.
 *
 * <p>
 * The files are discovered through
 * {@link ObsidianResource#findAllMarkdownFiles(Path)}, each one is handed to a
 * {@link MarkdownDocumentParser}, and every resulting {@link Document} gets the value
 * of {@code resource.getSource()} stored in its metadata under
 * {@link ObsidianResource#SOURCE}.
 */
public class ObsidianDocumentReader implements DocumentReader {

    private final Path vaultPath;

    private final MarkdownDocumentParser parser;

    /**
     * Constructor for reading all files in vault
     * @param vaultPath Path to Obsidian vault
     */
    public ObsidianDocumentReader(Path vaultPath) {
        this.vaultPath = vaultPath;
        this.parser = new MarkdownDocumentParser();
    }

    /**
     * Reads and parses every markdown file found in the vault.
     * @return the documents parsed from all files, in the order the files were
     * discovered
     * @throws RuntimeException if reading (or closing) one of the files fails with an
     * {@link IOException}
     */
    @Override
    public List<Document> get() {
        List<Document> allDocuments = new ArrayList<>();

        // Find all markdown files in vault
        List<ObsidianResource> resources = ObsidianResource.findAllMarkdownFiles(vaultPath);

        // Parse each file
        for (ObsidianResource resource : resources) {
            // The stream is backed by an open file handle; close it as soon as the
            // file has been parsed so that large vaults do not exhaust file
            // descriptors. Closing an already-closed stream is a no-op, so this is
            // safe even if the parser closes it as well.
            try (var inputStream = resource.getInputStream()) {
                List<Document> documents = parser.parse(inputStream);
                String source = resource.getSource();

                // Add metadata to each document
                for (Document doc : documents) {
                    doc.getMetadata().put(ObsidianResource.SOURCE, source);
                }

                allDocuments.addAll(documents);
            }
            catch (IOException e) {
                throw new RuntimeException("Failed to read Obsidian file: " + resource.getFilePath(), e);
            }
        }

        return allDocuments;
    }

    /**
     * Creates a new {@link Builder}.
     * @return a fresh builder instance
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Fluent builder for {@link ObsidianDocumentReader}.
     */
    public static class Builder {

        private Path vaultPath;

        /**
         * Sets the vault to read.
         * @param vaultPath Path to Obsidian vault
         * @return this builder
         */
        public Builder vaultPath(Path vaultPath) {
            this.vaultPath = vaultPath;
            return this;
        }

        /**
         * Builds the reader from the configured vault path.
         * @return a new {@link ObsidianDocumentReader}
         */
        public ObsidianDocumentReader build() {
            return new ObsidianDocumentReader(vaultPath);
        }

    }

}
ObsidianDocumentReader的get方法通过ObsidianResource.findAllMarkdownFiles(vaultPath)获取vault下所有的ObsidianResource,之后遍历resources,使用MarkdownDocumentParser逐个解析为List<Document>,并在每个Document的metadata中以source为key记录resource.getSource()的值
ObsidianResource
community/document-readers/spring-ai-alibaba-starter-document-reader-obsidian/src/main/java/com/alibaba/cloud/ai/reader/obsidian/ObsidianResource.java
/**
 * {@link Resource} for a single markdown file inside an Obsidian vault, plus a static
 * helper that discovers all such files below a vault directory.
 */
public class ObsidianResource implements Resource {

    /** Metadata key under which the source of a document is stored. */
    public static final String SOURCE = "source";

    /** File name suffix a file must have to be treated as markdown. */
    public static final String MARKDOWN_EXTENSION = ".md";

    private final Path vaultPath;

    private final Path filePath;

    private final InputStream inputStream;

    /**
     * Constructor for single file. Validates both paths and opens an input stream on
     * the file immediately.
     * @param vaultPath Path to Obsidian vault
     * @param filePath Path to markdown file
     * @throws IllegalArgumentException if a path is {@code null}, does not exist, or
     * the file does not end with {@link #MARKDOWN_EXTENSION} (raised by the
     * {@code Assert} checks)
     * @throws RuntimeException if the input stream cannot be opened
     */
    public ObsidianResource(Path vaultPath, Path filePath) {
        Assert.notNull(vaultPath, "VaultPath must not be null");
        Assert.notNull(filePath, "FilePath must not be null");
        Assert.isTrue(Files.exists(vaultPath), "Vault directory does not exist: " + vaultPath);
        Assert.isTrue(Files.exists(filePath), "File does not exist: " + filePath);
        Assert.isTrue(filePath.toString().endsWith(MARKDOWN_EXTENSION), "File must be a markdown file: " + filePath);

        this.vaultPath = vaultPath;
        this.filePath = filePath;
        try {
            this.inputStream = new FileInputStream(filePath.toFile());
        }
        catch (IOException e) {
            throw new RuntimeException("Failed to create input stream for file: " + filePath, e);
        }
    }

    /**
     * Find all markdown files in the vault. Recursively searches through all
     * subdirectories. Only includes regular .md files and ignores hidden
     * files/directories, i.e. any path that has a name element starting with
     * {@code "."} below the vault root.
     * @param vaultPath Root path of the Obsidian vault
     * @return List of ObsidianResource for each markdown file
     * @throws RuntimeException if walking the directory tree fails
     */
    public static List<ObsidianResource> findAllMarkdownFiles(Path vaultPath) {
        Assert.notNull(vaultPath, "VaultPath must not be null");
        Assert.isTrue(Files.exists(vaultPath), "Vault directory does not exist: " + vaultPath);
        Assert.isTrue(Files.isDirectory(vaultPath), "VaultPath must be a directory: " + vaultPath);

        List<ObsidianResource> resources = new ArrayList<>();

        try (Stream<Path> paths = Files.walk(vaultPath)) {
            paths
                // Only include .md files
                .filter(path -> path.toString().endsWith(MARKDOWN_EXTENSION))
                // Ignore hidden files and files in hidden directories
                .filter(path -> {
                    // Iterate the name elements of the path instead of splitting its
                    // string form on "/": the separator is platform specific ("\" on
                    // Windows), so a string split would miss hidden directories there.
                    for (Path part : vaultPath.relativize(path)) {
                        if (part.toString().startsWith(".")) {
                            return false;
                        }
                    }
                    return true;
                })
                // Only include regular files (not directories)
                .filter(Files::isRegularFile)
                .forEach(path -> resources.add(new ObsidianResource(vaultPath, path)));
        }
        catch (IOException e) {
            throw new RuntimeException("Failed to walk vault directory: " + vaultPath, e);
        }

        return resources;
    }

    //......
}
ObsidianResource构造器要求输入vaultPath和filePath,其findAllMarkdownFiles方法会递归遍历vaultPath目录,找出以.md结尾的普通文件,并忽略隐藏文件以及隐藏目录(名称以.开头)下的文件
示例
community/document-readers/spring-ai-alibaba-starter-document-reader-obsidian/src/test/java/com/alibaba/cloud/ai/reader/obsidian/ObsidianDocumentReaderIT.java
/**
 * Integration test for {@link ObsidianDocumentReader}. It is only enabled when the
 * {@code OBSIDIAN_VAULT_PATH} environment variable holds a non-empty value.
 */
@EnabledIfEnvironmentVariable(named = "OBSIDIAN_VAULT_PATH", matches = ".+")
class ObsidianDocumentReaderIT {

    private static final String VAULT_PATH = System.getenv("OBSIDIAN_VAULT_PATH");

    // Tell the user why nothing runs when the environment variable is missing
    static {
        if (!isVaultPathConfigured()) {
            System.out.println("Skipping Obsidian tests because OBSIDIAN_VAULT_PATH environment variable is not set.");
        }
    }

    ObsidianDocumentReader reader;

    /**
     * @return {@code true} when the vault path environment variable is set and
     * non-empty
     */
    private static boolean isVaultPathConfigured() {
        return VAULT_PATH != null && !VAULT_PATH.isEmpty();
    }

    @BeforeEach
    void setUp() {
        // Leave the reader unset when no vault is configured
        if (!isVaultPathConfigured()) {
            return;
        }
        Path vault = Path.of(VAULT_PATH);
        reader = ObsidianDocumentReader.builder().vaultPath(vault).build();
    }

    @Test
    void should_read_markdown_files() {
        // Nothing to verify without a reader
        Assumptions.assumeTrue(reader != null, "Skipping test because ObsidianDocumentReader could not be initialized");

        // when
        List<Document> loaded = reader.get();

        // then
        assertThat(loaded).isNotEmpty();

        // Check every document: source metadata first, then its text
        for (Document document : loaded) {
            var metadata = document.getMetadata();
            assertThat(metadata).containsKey(ObsidianResource.SOURCE);

            String origin = metadata.get(ObsidianResource.SOURCE).toString();
            assertThat(origin).isNotEmpty().endsWith(ObsidianResource.MARKDOWN_EXTENSION);

            assertThat(document.getText()).isNotEmpty();

            // Debug output
            System.out.println("Document source: " + origin);
            if (document.getMetadata().containsKey("category")) {
                System.out.println("Document category: " + document.getMetadata().get("category"));
            }
            System.out.println("Document content: " + document.getText());
            System.out.println("---");
        }
    }

}
小结
spring-ai-alibaba-starter-document-reader-obsidian提供了ObsidianDocumentReader用于读取指定仓库(vaultPath)下的所有markdown文件,之后使用MarkdownDocumentParser去解析为List<Document>。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。