序
本文主要研究一下langchain4j+poi读取文档
步骤
pom.xml
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j</artifactId>
<version>1.0.0-beta1</version>
</dependency>
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-document-parser-apache-poi</artifactId>
<version>1.0.0-beta1</version>
</dependency>
example
/**
 * Minimal demo: parses an {@code .xlsx} file with the Apache POI document parser
 * and logs the resulting text segment, metadata and plain text.
 */
public class POITest {

    // NOTE(review): `log` is not declared in this snippet — presumably supplied by a
    // logging annotation or field outside the excerpt; confirm before compiling.
    public static void main(String[] args) {
        // The input file is looked up under the current user's home directory.
        String homeDir = System.getProperty("user.home");
        String workbookPath = homeDir + "/downloads/tmp.xlsx";

        DocumentParser poiParser = new ApachePoiDocumentParser();
        Document parsed = FileSystemDocumentLoader.loadDocument(workbookPath, poiParser);

        log.info("textSegment:{}", parsed.toTextSegment());
        log.info("meta data:{}", parsed.metadata().toMap());
        log.info("text:{}", parsed.text());
    }
}
指定好文件路径后,通过ApachePoiDocumentParser来解析,最后统一返回Document对象。Document可以通过toTextSegment()转换为TextSegment,再经EmbeddingModel生成向量后存入向量数据库,例如:
// Turn the parsed document into one text segment, embed it, and add the
// embedding together with the segment to the embedding store.
EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel();
TextSegment segment = document.toTextSegment();
Embedding vector = embeddingModel.embed(segment).content();
embeddingStore.add(vector, segment);
源码
Document
langchain4j-core/src/main/java/dev/langchain4j/data/document/Document.java
/**
 * A piece of text together with its {@link Metadata}.
 *
 * <p>Instances are obtained through the static factories {@link #from(String)} /
 * {@link #document(String)} and their two-argument variants, all of which create a
 * {@link DefaultDocument}.</p>
 */
public interface Document {

    /**
     * Metadata key commonly used for the name of the file the document was loaded from.
     */
    String FILE_NAME = "file_name";

    /**
     * Metadata key commonly used for the absolute path of the directory the document was loaded from.
     */
    String ABSOLUTE_DIRECTORY_PATH = "absolute_directory_path";

    /**
     * Metadata key commonly used for the URL the document was loaded from.
     */
    String URL = "url";

    /**
     * Gives access to the textual content of this document.
     *
     * @return the document text.
     */
    String text();

    /**
     * Gives access to the metadata attached to this document.
     *
     * @return the document metadata.
     */
    Metadata metadata();

    /**
     * Reads a single metadata entry by key, delegating to {@code metadata().get(key)}.
     *
     * @param key the metadata key to read.
     * @return the value stored under {@code key}, or null if the key is not present.
     * @deprecated as of 0.31.0, call {@link #metadata()} and then one of
     * {@link Metadata#getString(String)}, {@link Metadata#getInteger(String)},
     * {@link Metadata#getLong(String)}, {@link Metadata#getFloat(String)} or
     * {@link Metadata#getDouble(String)} instead.
     */
    @Deprecated(forRemoval = true)
    default String metadata(String key) {
        Metadata current = metadata();
        return current.get(key);
    }

    /**
     * Converts this document into a single {@link TextSegment}.
     *
     * <p>The segment receives the document text and a copy of the document metadata
     * to which the entry {@code "index" -> "0"} has been added.</p>
     *
     * @return a {@link TextSegment} built from this document.
     */
    default TextSegment toTextSegment() {
        // Read the text first, then derive the metadata, mirroring argument evaluation order.
        String content = text();
        Metadata segmentMetadata = metadata().copy().put("index", "0");
        return TextSegment.from(content, segmentMetadata);
    }

    /**
     * Factory for a document that carries only text.
     *
     * <p>The resulting document has empty metadata.</p>
     *
     * @param text the text of the document.
     * @return a new Document.
     */
    static Document from(String text) {
        return new DefaultDocument(text);
    }

    /**
     * Factory for a document that carries text and metadata.
     *
     * @param text     the text of the document.
     * @param metadata the metadata of the document.
     * @return a new Document.
     */
    static Document from(String text, Metadata metadata) {
        return new DefaultDocument(text, metadata);
    }

    /**
     * Alias of {@link #from(String)}.
     *
     * <p>The resulting document has empty metadata.</p>
     *
     * @param text the text of the document.
     * @return a new Document.
     */
    static Document document(String text) {
        return Document.from(text);
    }

    /**
     * Alias of {@link #from(String, Metadata)}.
     *
     * @param text     the text of the document.
     * @param metadata the metadata of the document.
     * @return a new Document.
     */
    static Document document(String text, Metadata metadata) {
        return Document.from(text, metadata);
    }
}
Document是个接口,提供了text方法返回文档的文本内容,metadata方法返回文档的元数据信息,toTextSegment方法将文档转换为TextSegment;静态方法from/document用于创建实例,默认实现是DefaultDocument
DefaultDocument
langchain4j-core/src/main/java/dev/langchain4j/data/document/DefaultDocument.java
/**
 * Record-based {@link Document} implementation that pairs a text with its {@link Metadata}.
 *
 * @param text     the document text; checked with {@code ensureNotBlank} on construction.
 * @param metadata the document metadata; checked with {@code ensureNotNull} on construction.
 */
public record DefaultDocument(String text, Metadata metadata) implements Document {

    /**
     * Canonical constructor: validates both components before the fields are assigned.
     */
    public DefaultDocument {
        ensureNotBlank(text, "text");
        ensureNotNull(metadata, "metadata");
    }

    /**
     * Creates a document with the given text and a new, empty {@link Metadata}.
     *
     * @param text the document text.
     */
    public DefaultDocument(String text) {
        this(text, new Metadata());
    }

    /**
     * Reads a single metadata entry directly from the record's metadata component.
     *
     * @param key the metadata key to read.
     * @return the result of {@code metadata.get(key)}.
     */
    @Override
    public String metadata(String key) {
        return this.metadata.get(key);
    }
}
Metadata
langchain4j-core/src/main/java/dev/langchain4j/data/document/Metadata.java
/**
* Key-value metadata stored in a {@link HashMap} with {@link String} keys.
* Only the value types listed in {@code SUPPORTED_VALUE_TYPES} are accepted by the map-based constructor.
* Part of the class body is elided in this excerpt.
*/
public class Metadata {
// Value types accepted by the map-based constructor; a LinkedHashSet keeps insertion order,
// which is the order in which the types appear in the error message below.
private static final Set<Class<?>> SUPPORTED_VALUE_TYPES = new LinkedHashSet<>();
static {
SUPPORTED_VALUE_TYPES.add(String.class);
SUPPORTED_VALUE_TYPES.add(UUID.class);
SUPPORTED_VALUE_TYPES.add(int.class);
SUPPORTED_VALUE_TYPES.add(Integer.class);
SUPPORTED_VALUE_TYPES.add(long.class);
SUPPORTED_VALUE_TYPES.add(Long.class);
SUPPORTED_VALUE_TYPES.add(float.class);
SUPPORTED_VALUE_TYPES.add(Float.class);
SUPPORTED_VALUE_TYPES.add(double.class);
SUPPORTED_VALUE_TYPES.add(Double.class);
}
// Backing storage for all entries.
private final Map<String, Object> metadata;
/**
* Construct a Metadata object with an empty map of key-value pairs.
*/
public Metadata() {
this.metadata = new HashMap<>();
}
/**
* Constructs a Metadata object from a map of key-value pairs.
* Every entry is checked first; the entries are then copied into a new {@link HashMap}.
*
* @param metadata the map of key-value pairs; must not be {@code null}. {@code null} values are not permitted.
* Supported value types: {@link String}, {@link UUID}, {@link Integer}, {@link Long}, {@link Float}, {@link Double}
*/
public Metadata(Map<String, ?> metadata) {
ensureNotNull(metadata, "metadata").forEach((key, value) -> {
// NOTE(review): validate(...) is defined in the elided part of the class;
// presumably it rejects null keys/values — confirm.
validate(key, value);
// Reject any value whose runtime class is not in the supported set.
if (!SUPPORTED_VALUE_TYPES.contains(value.getClass())) {
throw illegalArgument("The metadata key '%s' has the value '%s', which is of the unsupported type '%s'. " +
"Currently, the supported types are: %s",
key, value, value.getClass().getName(), SUPPORTED_VALUE_TYPES
);
}
});
// Copy the entries so this object has its own map instance.
this.metadata = new HashMap<>(metadata);
}
//......
}
Metadata本质上是用Map<String, Object>来存储元数据信息,value目前仅支持String、UUID、Integer、Long、Float、Double这几种类型,通过Map构造时若传入其他类型的value会抛出异常
小结
langchain4j提供了langchain4j-document-parser-apache-poi模块,用于读取办公文档并解析成Document类型;Document可以通过toTextSegment()转换为TextSegment,经EmbeddingModel向量化之后即可存入向量数据库。
评论支持部分 Markdown 语法:**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用。你还可以使用 @ 来通知其他用户。