本文主要研究一下Spring AI Alibaba的PdfTablesParser

PdfTablesParser

community/document-parsers/spring-ai-alibaba-starter-document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java

/**
 * {@link DocumentParser} that extracts the tables found on one page of a PDF file
 * (using tabula's {@link SpreadsheetExtractionAlgorithm}) and turns every table row into
 * one {@link Document}.
 *
 * <p>
 * The cells of a row are joined with {@code |} and the row is terminated with a trailing
 * {@code |}; carriage returns inside a cell are removed and line feeds are replaced by a
 * space.
 */
public class PdfTablesParser implements DocumentParser {

    /**
     * The page number of the PDF file to be parsed. Default value is 1.
     */
    private final Integer page;

    /**
     * The metadata of the PDF file to be parsed. Never {@code null}; every entry is
     * copied into the metadata of each produced document.
     */
    private final Map<String, String> metadata;

    /**
     * Creates a parser that reads page 1 and attaches no metadata.
     */
    public PdfTablesParser() {

        this(1);
    }

    /**
     * Creates a parser that reads the given page and attaches no metadata.
     * @param pageNumber page to parse; it is validated when a file is parsed
     */
    public PdfTablesParser(Integer pageNumber) {

        this(pageNumber, Map.of());
    }

    /**
     * Creates a parser that reads the given page and attaches the given metadata.
     * @param pageNumber page to parse; it is validated when a file is parsed
     * @param metadata entries added to every produced document; {@code null} is treated
     * as "no metadata"
     */
    public PdfTablesParser(Integer pageNumber, Map<String, String> metadata) {

        this.page = pageNumber;
        // A null map used to fail later with a NullPointerException in addMetadata.
        this.metadata = (metadata != null) ? metadata : Map.of();
    }

    /**
     * Parses the configured page of the PDF read from {@code inputStream}.
     *
     * <p>
     * This method does not explicitly close {@code inputStream}.
     * @param inputStream PDF content
     * @return one document per table row; an empty list when the page has no table rows
     * @throws RuntimeException wrapping any failure raised while loading or extracting
     */
    @Override
    public List<Document> parse(InputStream inputStream) {

        try {
            return data2Document(parseTables(extraTableData(inputStream)));
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Loads the PDF and extracts the tables of the configured page.
     * @param in PDF content
     * @return the tables detected on the page
     * @throws IllegalArgumentException if the configured page number is {@code null} or
     * smaller than 1
     * @throws RuntimeException if the file has no pages or fewer pages than the
     * configured page number
     * @throws Exception if the PDF cannot be loaded or extracted
     */
    protected List<Table> extraTableData(InputStream in) throws Exception {

        if (page == null || page < 1) {
            throw new IllegalArgumentException("The page number must be a positive integer, but was: " + page);
        }

        // try-with-resources: the loaded document must be closed on every path,
        // including the validation failures below.
        try (PDDocument document = PDDocument.load(in)) {

            // check pdf files
            int numberOfPages = document.getNumberOfPages();
            if (numberOfPages <= 0) {
                throw new RuntimeException("No page found in the PDF file.");
            }

            if (page > numberOfPages) {
                throw new RuntimeException("The page number is greater than the number of pages in the PDF file.");
            }

            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();

            // extract page by page numbers.
            Page extract = new ObjectExtractor(document).extract(this.page);

            // NOTE(review): the returned tables are assumed to hold already-extracted
            // text and not to need the open document afterwards - confirm against tabula.
            return sea.extract(extract);
        }
    }

    /**
     * Flattens the tables into one string per row.
     * @param data tables to convert
     * @return the rows of all tables in encounter order, cells separated by {@code |}
     * and each row ending with {@code |}
     */
    protected List<String> parseTables(List<Table> data) {

        if (data.isEmpty()) {
            return Collections.emptyList();
        }

        return data.stream()
            .flatMap(table -> table.getRows()
                .stream()
                .map(cells -> cells.stream()
                    .map(content -> content.getText().replace("\r", "").replace("\n", " "))
                    .collect(Collectors.joining("|")) + "|"))
            .collect(Collectors.toList());
    }

    /**
     * Wraps every row string into a {@link Document} carrying the configured metadata.
     * @param data row strings
     * @return one document per row; empty (never {@code null}) when {@code data} is empty
     */
    private List<Document> data2Document(List<String> data) {

        List<Document> documents = new ArrayList<>(data.size());

        for (String datum : data) {
            documents.add(addMetadata(new Document(datum)));
        }

        return documents;
    }

    /**
     * Copies the configured metadata entries into the document's metadata.
     * @param document document to enrich
     * @return the same document instance
     */
    private Document addMetadata(Document document) {

        if (!metadata.isEmpty()) {
            document.getMetadata().putAll(metadata);
        }

        return document;
    }

}
PdfTablesParser使用tabula来解析pdf:parse方法先执行extraTableData,再执行parseTables,最后执行data2Document。extraTableData方法用PDDocument加载pdf,并使用SpreadsheetExtractionAlgorithm从指定页(默认为第1页)中提取出List<Table>;parseTables将List<Table>中的每一行转换为以"|"分隔单元格、并以"|"结尾的字符串,得到List<String>;data2Document则将每个字符串包装为一个Document并附加metadata,得到List<Document>

示例

/**
 * Tests for {@link PdfTablesParser}, based on the classpath fixtures
 * {@code pdf-tables.pdf} and {@code sample1.pdf}.
 */
class PdfTablesParserTests {

    private Resource resource;

    private Resource resource2;

    /**
     * Resolves both PDF fixtures and fails fast when either one is missing.
     */
    @BeforeEach
    void setUp() {

        resource = new DefaultResourceLoader().getResource("classpath:/pdf-tables.pdf");
        resource2 = new DefaultResourceLoader().getResource("classpath:/sample1.pdf");

        if (!resource.exists()) {
            throw new RuntimeException("Resource not found: " + resource);
        }
        if (!resource2.exists()) {
            throw new RuntimeException("Resource not found: " + resource2);
        }
    }

    /**
     * tabula-java use: prints every cell of every table on every page.
     */
    @Test
    void PdfTableTest() throws IOException {

        // Both the file stream and the document are closed when the block exits.
        try (InputStream in = new FileInputStream(resource.getFile());
                PDDocument document = PDDocument.load(in)) {
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            while (pi.hasNext()) {
                // iterate over the pages of the document
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                // iterate over the tables of the page
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // iterate over the rows of the table
                    for (List<RectangularTextContainer> cells : rows) {
                        // print all column-cells of the row plus linefeed
                        for (RectangularTextContainer content : cells) {
                            // Note: Cell.getText() uses \r to concat text chunk
                            String text = content.getText().replace("\r", " ");
                            System.out.print(text + "|");
                        }
                        System.out.println();
                    }
                }
            }
        }

    }

    /**
     * Parses page 1 of {@code pdf-tables.pdf} with the default parser and compares the
     * rows after the first one with the expected table.
     */
    @Test
    void PdfTablesParseTest() throws IOException {

        String res = """
                |name|age|sex|
                |zhangsan|20|m|
                |lisi|21|w|
                |wangwu|22|m|
                |zhangliu|23|w|
                |songqi|24|w|
                """;

        List<Document> docs;
        try (InputStream in = new FileInputStream(resource.getFile())) {
            docs = new PdfTablesParser().parse(in);
        }

        // NOTE(review): the first extracted row is skipped; presumably it is not part of
        // the expected table - confirm against the fixture.
        StringBuilder sb = new StringBuilder();
        docs.subList(1, docs.size()).forEach(doc -> sb.append(doc.getText()).append("\n"));

        Assert.equals(res, sb.toString());
    }

    /**
     * Parses page 1 of {@code sample1.pdf} with the default parser.
     */
    @Test
    void PdfTablesParseTest2() throws IOException {

        String res = """
                Sample Date:|May 2001|
                Prepared by:|Accelio Present Applied Technology|
                Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4|
                Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.|
                """;

        List<Document> docs;
        try (InputStream in = new FileInputStream(resource2.getFile())) {
            docs = new PdfTablesParser().parse(in);
        }

        StringBuilder sb = new StringBuilder();
        docs.forEach(doc -> sb.append(doc.getText()).append("\n"));

        Assert.equals(res, sb.toString());

    }

    /**
     * Parses page 3 of {@code sample1.pdf}.
     */
    @Test
    void PdfTablesParseTest3() throws IOException {

        String res = """
                |Filename|||escription|escription||
                |||||||
                ap_bookmark.IFD|The template design.||||||
                ap_bookmark.mdf|The template targeted for PDF output.||||||
                ap_bookmark.dat|A sample data file in DAT format.||||||
                ap_bookmark.bmk|A sample bookmark file.||||||
                ap_bookmark.pdf|Sample PDF output.||||||
                ap_bookmark_doc.pdf|A document describing the sample.||||||
                |To bookmark by|Use the command line parameter||
                |Invoices|-abmkap_bookmark.bmk -abmsinvoices||
                |Type|-abmkap_bookmark.bmk -abmstype||
                |Amount|-abmkap_bookmark.bmk -abmsamount||
                """;

        List<Document> docs;
        try (InputStream in = new FileInputStream(resource2.getFile())) {
            docs = new PdfTablesParser(3).parse(in);
        }

        StringBuilder sb = new StringBuilder();
        docs.forEach(doc -> sb.append(doc.getText()).append("\n"));

        Assert.equals(res, sb.toString());

    }

}

小结

Spring AI Alibaba的spring-ai-alibaba-starter-document-parser-pdf-tables提供了PdfTablesParser用于解析pdf文件中的表格数据到Document。

doc


codecraft
11.9k 声望2k 粉丝

当一个代码的工匠回首往事时,不因虚度年华而悔恨,也不因碌碌无为而羞愧,这样,当他老的时候,可以很自豪告诉世人,我曾经将代码注入生命去打造互联网的浪潮之巅,那是个很疯狂的时代,我在一波波的浪潮上留下...