我如何使用LlamaIndex和LangChain为PDF中的表格和文本建立索引?(我正在使用OpenAI密钥)

3
def ask(file):
    """Load the cached vector index, or build one from a PDF and cache it.

    If ``INDEX_FILE`` already exists, the index is loaded from it and the PDF
    is not parsed at all. Otherwise the PDF is read with the ``PDFReader``
    loader, a ``GPTSimpleVectorIndex`` is built from the resulting documents
    and saved to ``INDEX_FILE``.

    Args:
        file: Path (string or path-like) of the PDF to index.
    """
    print(" Loading...")
    pdf_path = Path(file)
    print("Path: ", pdf_path)

    # Check if the index file exists
    if os.path.exists(INDEX_FILE):
        # Load the index from the file.
        # NOTE(review): the cache is keyed only on INDEX_FILE, not on `file`,
        # so an index built from a different PDF is reused as-is.
        logger.info("found index.json in the directory")
        index = GPTSimpleVectorIndex.load_from_disk(INDEX_FILE)
    else:
        logger.info("didnt find index.json in the directory")

        # Parsing the PDF is only needed when the index has to be (re)built,
        # so the loader is fetched and run here rather than on every call.
        PDFReader = download_loader("PDFReader")
        documents = PDFReader().load_data(file=pdf_path)

        llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))
        service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=1024)
        index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

        # Save the index to the file
        index.save_to_disk(INDEX_FILE)

上面是我为PDF生成索引的代码片段。我使用LlamaHub中的PDFReader从PDF文件中提取文本内容。当询问正文内容时,机器人回答得很好;但当我询问PDF表格中的值时,它就答不上来了。
我尝试过不同的OpenAI文本模型,其中效果最好的是text-davinci-003,但机器人仍然无法回答关于PDF表格中数值的问题。这是因为PDFReader只是把PDF内容转换成纯文本(它没有对表格内容做任何特殊处理)。我想知道如何使用LangChain和LlamaIndex成功地为PDF中的文本和表格建立索引。
1个回答

0
我知道你正在寻找开源的解决方案,但我想与你分享一下:使用 Adobe API 可以实现这一点,你可以从这里免费试用:Adobe API Developer。以下是 Python 函数(请确保先从 Adobe API 获取 'private.Key' 文件,参见 https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/ )。点击开始试用后,你会在下载到计算机上的压缩文件中找到该文件。
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
import logging
import os
import re
import zipfile
import json
import glob
import pandas as pd

def adobeAPI(base_path, file_path,
             credentials_path="/path/to/pdfservices-api-credentials.json"):
    """Convert a PDF to a text file in which tables are embedded as JSON.

    The PDF is sent to the Adobe PDF Extract operation (text and table
    elements, with table and figure renditions). The resulting ZIP is saved
    to ``<base_path>/output/<name>.zip`` and unpacked next to it. The elements
    of ``structuredData.json`` are then written, one per line, to
    ``<base_path>/output/<name>/<name>.txt``:

    * figure elements and elements nested inside a table are skipped;
    * each table element is replaced by the JSON dump of the next exported
      ``tables/*.xlsx`` workbook;
    * any other element that has a ``Text`` field contributes that text.

    Args:
        base_path: Directory under which the ``output`` folder is written.
        file_path: Path of the source PDF.
        credentials_path: Adobe service-account credentials JSON file.

    Returns:
        Path of the generated ``.txt`` file.

    Raises:
        ValueError: If no output name can be derived from ``file_path``.
        ServiceApiException, ServiceUsageException, SdkException: Logged and
            re-raised when the Adobe operation fails.
    """
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

    filename = get_filename(file_path)
    if filename is None:
        raise ValueError(
            "Cannot derive an output name from %r (expected a .pdf path)" % (file_path,))

    # Computed before the try block so they are always defined afterwards.
    outputzip = os.path.join(base_path, "output", filename + ".zip")
    outputzipextract = os.path.join(base_path, "output", filename)

    try:
        # Initial setup, create credentials instance.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(credentials_path) \
            .build()

        # Create an ExecutionContext using credentials and create a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(file_path)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation and save the result to the specified location.
        result: FileRef = extract_pdf_operation.execute(execution_context)
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
        # Without a ZIP there is nothing to post-process; propagate the
        # failure instead of continuing with a missing file.
        raise

    # Unpack the result archive next to it.
    with zipfile.ZipFile(outputzip, 'r') as zip_ref:
        zip_ref.extractall(path=outputzipextract)

    with open(os.path.join(outputzipextract, "structuredData.json"),
              encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Table workbooks are consumed in order, one per table element.
    # glob() returns files in arbitrary order, so sort them numerically.
    # NOTE(review): assumes the numeric suffix of the exported file names
    # follows document order -- TODO confirm against the Extract API output.
    xlsx_queue = iter(sorted(
        glob.glob(os.path.join(outputzipextract, "tables", "*.xlsx")),
        key=_natural_sort_key))

    figure_re = re.compile(r"Figure$")
    table_re = re.compile(r"\/Table(?:\[\d+\])?$")
    table_part_re = re.compile(r"/Table(?:\[\d+\])?/")

    pieces = []
    # Every element is visited, including the last one.
    for element in data['elements']:
        element_path = element['Path']

        if figure_re.search(element_path) or table_part_re.search(element_path):
            # Figures carry no text; table cells are covered by the workbook.
            continue

        if table_re.search(element_path):
            xlsx_path = next(xlsx_queue, None)
            if xlsx_path is None:
                logging.warning(
                    "No exported workbook left for table element %s", element_path)
                continue
            # os.path.basename works with both "/" and "\\" separators.
            rows = get_dict_xlsx(outputzipextract, os.path.basename(xlsx_path))
            pieces.append(json.dumps(rows))
        elif 'Text' in element:
            pieces.append(element['Text'])

    localfile = os.path.join(outputzipextract, filename + '.txt')
    with open(localfile, "w", encoding='utf-8') as txt_file:
        # Each piece is preceded by a newline, matching the original layout.
        txt_file.write("".join("\n" + piece for piece in pieces))

    return localfile


def _natural_sort_key(path):
    """Sort key that orders file names by their embedded numbers (2 before 10)."""
    return [int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split(r"(\d+)", os.path.basename(path))]


############################ < Function to get filename out of path>##################


def get_filename(file_path):
    """Return the name of a PDF file without its directory and extension.

    Both ``/`` and ``\\`` are accepted as path separators, a bare file name
    with no directory part is accepted, and the ``.pdf`` extension is matched
    case-insensitively.

    Args:
        file_path: Path of a PDF file.

    Returns:
        The file name without ``.pdf``, or ``None`` if ``file_path`` does not
        end in a non-empty name followed by ``.pdf``.
    """
    # "(?:^|[/\\])" lets the name start either at the beginning of the string
    # or right after the last path separator.
    match = re.search(r'(?:^|[/\\])([^/\\]+)\.pdf$', file_path, re.IGNORECASE)
    return match.group(1) if match else None

############################ </ Function to get filename out of path>##################



#################### < Function to get a dictionary of Excel files>##################

def get_dict_xlsx(outputzipextract, xlsx_file):
    """Read an exported table workbook and return its rows as cleaned dicts.

    The workbook ``<outputzipextract>/tables/<xlsx_file>`` (sheet ``Sheet1``)
    is loaded with pandas. In every header and every string cell, escape
    sequences of the form ``_xHHHH_`` (for example ``_x000D_``) are removed
    and surrounding whitespace is stripped. Non-string values (numbers, NaN
    for empty cells, ...) are returned unchanged.

    Args:
        outputzipextract: Directory containing the ``tables`` folder.
        xlsx_file: File name of the workbook inside ``tables``.

    Returns:
        A list with one ``{column: value}`` dict per table row.
    """
    escape_re = re.compile(r'_x[0-9a-fA-F]{4}_')

    def _clean(value):
        # Only text can carry the escapes; re.sub would raise on numbers/NaN.
        if isinstance(value, str):
            return escape_re.sub('', value).strip()
        return value

    frame = pd.read_excel(
        os.path.join(outputzipextract, "tables", xlsx_file),
        sheet_name='Sheet1', engine='openpyxl')

    return [
        {_clean(column): _clean(cell) for column, cell in row.items()}
        for row in frame.to_dict(orient='records')
    ]

#################### </Function to get a dictionary of Excel files>##################


这是运行代码之前的文件结构:

enter image description here

运行后,您将拥有一个名为“output”的文件夹,在其中可以找到您的PDF的txt版本。当您打开txt文件时,您会注意到其中的表格以json格式呈现。GPT能够读取json并在尝试回答您的问题时加以考虑。 在我的PDF文件中,有一个表格:

Pump_Table

您的txt文件中会包含该表格的JSON表示:
[{
    "No.": "1",
    "Equipment": "Pump",
    "Plant": "A1",
    "Tag": "P-1"
}, {
    "No.": "2",
    "Equipment": "Tank",
    "Plant": "A2",
    "Tag": "T-1"
}, {
    "No.": "3",
    "Equipment": "Heat Exchanger",
    "Plant": "A3",
    "Tag": "HE-1"
}, {
    "No.": "4",
    "Equipment": "Vessel",
    "Plant": "A4",
    "Tag": "V-1"
}]

运行代码后,这是文件结构:

enter image description here

希望能有所帮助。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接