Python: 基于指定单号批量合并PDF文件

Exploring

发布于 2024-11-23 09:13:11

6200

代码可运行

文章被收录于专栏：数据处理与编程实践

运行总次数：0

代码可运行

文章背景： 在工作中，有时需要合并指定单号的PDF文件。比如需要将指定单号的测试数据合并为一份文件。

解决思路： 合并PDF文件可以使用第三方模块PyMuPDF。如果电脑上尚未安装该模块，可以通过 pip install pymupdf 命令手动安装。

一开始笔者用的是PyPDF4模块，但是如果需要合并一百页以上的PDF文档，PyPDF4模块的性能明显不如PyMuPDF模块，所以后来改用了PyMuPDF模块。

完整的Python代码如下：

# coding=utf-8

import os
import re
import time
import fitz   # PyMuPDF
from datetime import datetime


def delete_file(f):
    """Remove the file at path ``f`` from disk.

    Errors raised by the operating system (for example when the path does
    not exist) are propagated to the caller unchanged.
    """
    # os.unlink is the same operation as os.remove under its Unix name.
    os.unlink(f)


def find_files_with_string(directory, search_string):
    """Return full paths of files in ``directory`` whose name contains ``search_string``.

    Only the top level of ``directory`` is scanned (sub-folders are neither
    returned nor descended into). The name comparison ignores case by
    upper-casing both the file name and the search string.
    """
    # Pair every directory entry with its joined path, lazily.
    entries = (
        (name, os.path.join(directory, name))
        for name in os.listdir(directory)
    )
    return [
        path
        for name, path in entries
        if os.path.isfile(path) and search_string.upper() in name.upper()
    ]


def ask_yes_no(prompt):
    """Ask a yes/no question on the console until a valid answer is given.

    The reply is stripped and lower-cased before it is checked.
    Returns True for "yes"/"y" and False for "no"/"n"; any other reply
    prints a hint and asks again.
    """
    verdicts = {"yes": True, "y": True, "no": False, "n": False}
    while True:
        reply = input(f"{prompt} (yes/no): ").strip().lower()
        if reply in verdicts:
            return verdicts[reply]
        print("Please answer 'yes' or 'no'.")


def extract_info(s):
    """Build a sort key from the letter/number suffix that follows a 7-digit number.

    Searches ``s`` for seven digits, a hyphen, one or more ASCII letters and
    one or more digits. Returns ``(letter_count, letters, number)`` for the
    first match, or ``(0, '', 0)`` when the pattern does not occur.
    """
    found = re.search(r'\d{7}-([A-Za-z]+)(\d+)', s)
    if found is None:
        # No suffix present: fall back to the default key.
        return 0, '', 0

    alpha_part, digit_part = found.groups()
    return len(alpha_part), alpha_part, int(digit_part)


def custom_sort(lst):
    """Return a new list with the items of ``lst`` ordered by ``extract_info``.

    The key is a tuple, so items are ordered by letter count first, then by
    the letters themselves, then by the numeric part. ``lst`` itself is not
    modified.
    """
    ordered = list(lst)
    ordered.sort(key=extract_info)
    return ordered


def _merge_into(pdf_paths, target_path):
    """Concatenate the PDFs in ``pdf_paths`` (in order) and save them to ``target_path``."""
    merger = fitz.Document()
    try:
        for path in pdf_paths:
            source = fitz.Document(path)
            try:
                merger.insert_pdf(source)
            finally:
                # Always release the source file handle, even if insertion fails.
                source.close()
        merger.save(target_path)
    finally:
        merger.close()


def merge_pdf(input_path, ss, output_path):
    """Merge every PDF in ``input_path`` whose name contains ``ss`` into one file.

    The merged document is written to ``<output_path>/<ss>.pdf``. Progress and
    validation messages are printed; the function always returns None.

    Args:
        input_path: Folder that holds the individual PDF files (top level only).
        ss: Order number to look for; it must be at least 7 characters long
            and end in 7 digits.
        output_path: Existing folder that receives the merged PDF.

    The function returns early, after printing a message, when:
        * ``input_path`` or ``output_path`` is not an existing directory,
        * ``ss`` fails the validation above,
        * no matching PDF file is found,
        * the user declines the confirmation prompt,
        * the output file already exists (it is never overwritten).
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    # Start timing (the measured span includes the confirmation prompt).
    start_time = time.time()

    # 1. Validate the input folder.
    if not os.path.isdir(input_path):
        print("输入路径不存在，请确认！")
        return

    # 2. Validate the order number.
    if len(ss) < 7:
        print("样品单号少于7位，请确认！")
        return
    if not ss[-7:].isdigit():
        print("样品单号最后7位不全是数字，请确认！")
        return

    # 3. Validate the output folder.
    if not os.path.isdir(output_path):
        print("输出路径不存在，请确认！")
        return

    # 4. Collect matching files and keep only real PDFs, so that the count
    #    shown to the user equals the number of files actually merged and an
    #    empty document is never saved. The extension check ignores case.
    candidates = find_files_with_string(input_path, ss)
    pdf_files = [path for path in candidates if path.lower().endswith(".pdf")]
    num_result = len(pdf_files)
    if num_result == 0:
        print("指定单号的PDF数据文件不存在，请确认！")
        return

    # Order the files with the custom sort key.
    result_sorted = custom_sort(pdf_files)

    # 5. Ask for confirmation before doing any work.
    if not ask_yes_no(f"总共要合并 {num_result} 份文件？"):
        print("提示", "未执行合并任务!")
        return

    file_path_output = os.path.join(output_path, ss + ".pdf")

    # Never overwrite an existing merged file.
    if os.path.isfile(file_path_output):
        print("提示", "文件已存在，请确认！\n" + file_path_output)
        return

    # 6. Merge in batches of 50 files, then merge the batch results.
    #    Intermediate files live in a temporary directory that is removed
    #    automatically, even when a merge step raises.
    batch_size = 50
    with tempfile.TemporaryDirectory() as tmp_dir:
        merged_files = []
        for index, start in enumerate(range(0, num_result, batch_size)):
            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            merged_file_path = os.path.join(
                tmp_dir, f"merged_{index}_{timestamp}.pdf"
            )
            _merge_into(result_sorted[start:start + batch_size], merged_file_path)
            merged_files.append(merged_file_path)

        _merge_into(merged_files, file_path_output)

    # Open the result with the default PDF viewer. os.startfile only exists
    # on Windows, so skip this step on other platforms instead of crashing.
    if hasattr(os, "startfile"):
        os.startfile(file_path_output)

    # Optional step: to remove the individual source PDFs after a successful
    # merge, call delete_file() on every entry of result_sorted here.

    # Stop timing and report.
    elapsed_time = round(time.time() - start_time)
    print(f"Done!\n共合并了 {num_result} 份PDF文档!\n总耗时：{elapsed_time} 秒")


if __name__ == '__main__':
    # Example run: merge the PDFs for one order number from the source
    # folder into the target folder.
    source_dir = "C:\\Local\\origin"
    target_dir = "C:\\Local\\target"
    order_no = "BYD24-0010001"

    merge_pdf(input_path=source_dir, ss=order_no, output_path=target_dir)

效果演示：

总共要合并 3 份文件？ (yes/no): yes
Done!
共合并了 3 份PDF文档!
总耗时：5 秒

（1）为了提高运行效率，如果需要合并50份以上的PDF文件，代码中进行了优化，每50份文件合并成一份，最后再汇总到一起。

参考资料：

[1] ChatGPT AI生成

[2] Python: 基于正则表达式自定义排序规则

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2024-09-08，如有侵权请联系 cloudcommunity@tencent.com 删除

python

本文分享自数据处理与编程实践微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

登录后参与评论

0 条评论

热度

Python: 基于指定单号批量合并PDF文件

Python: 基于指定单号批量合并PDF文件

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐