Docker部署文件脱敏助手

原创

happywei

修改于 2025-08-08 09:26:56

1830

项目结构：

your_project/
│
├── main.py
├── requirements.txt
├── Dockerfile
├── run.sh
└── static/
    ├── input_pdfs/
    ├── output_pdfs/
    └── output_txts/

requirements.txt：

fastapi
uvicorn
python-multipart
PyMuPDF

Dockerfile：

# Use the official Python image
FROM python:3.11-slim

# Install system dependencies (PyMuPDF needs libmupdf)
RUN apt-get update && apt-get install -y \
    build-essential \
    libmupdf-dev \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy only the dependency manifest first so the pip layer below stays cached
# until requirements.txt itself changes (source edits no longer reinstall).
COPY requirements.txt /app/requirements.txt

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . /app

# Create the static directory (used as a mount point or for testing)
RUN mkdir -p /app/static

# Run Uvicorn when the container starts
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8301"]

通过run.sh或者手动构建docker：

#!/bin/bash
# Abort on the first failing command so a failed build is never followed by
# starting a container from a stale or missing image.
set -euo pipefail

# Build the image
sudo docker build -t pdf-redact-app .

# Start the container in the background, publishing port 8301 and mounting
# the host directory onto the application's static directory.
sudo docker run -d \
  -p 8301:8301 \
  -v /data/desensitization_files:/app/static \
  --name pdf-redactor \
  pdf-redact-app

进入容器内部（调试用）

docker exec -it pdf-redactor /bin/bash

文件脱敏助手源码：

# main.py
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import StreamingResponse
import uvicorn
import fitz
import os
from typing import List
from tempfile import NamedTemporaryFile
import io
from pathlib import Path
import zipfile

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# Application instance; the route decorator further down registers on it.
app = FastAPI()

# CORS is configured fully open: every origin, method and header is accepted.
# NOTE(review): FastAPI's CORS documentation advises against combining
# wildcard origins with allow_credentials=True -- confirm whether credentialed
# cross-origin requests are really needed and, if so, list origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Static directory: save_to_static() writes processed files here. It is
# created at import time so those writes do not fail on a missing directory.
STATIC_DIR = "/app/static"
Path(STATIC_DIR).mkdir(parents=True, exist_ok=True)

def redact_after_keyword(page, keyword):
    """Collect the rectangles on ``page`` that should be masked for ``keyword``.

    Two passes are made over the page:

    1. Every text block whose text contains ``keyword`` contributes a strip
       that starts at the block's right edge and is 255 units wide for
       "借款人住所" or 100 units wide for any other keyword.
    2. For the keyword "手机号码" only, every search hit of the keyword
       followed by a full-width or ASCII colon that is wider than
       ``len(keyword) * 5`` contributes a 100-unit strip starting at the
       hit's right edge.

    A strip whose right edge would lie beyond the page width is dropped
    (not clipped).

    Args:
        page: Page object providing ``get_text("blocks")``, ``rect.width``
            and ``search_for``.
        keyword: Label text whose neighbouring value should be masked.

    Returns:
        A list of ``fitz.Rect`` areas to redact; empty when nothing matched.
    """
    text_blocks = page.get_text("blocks")
    width_limit = page.rect.width
    masks = []

    def queue_strip(anchor, extent):
        # Mask the area immediately to the right of ``anchor``; skip it when
        # the strip would run past the page's right edge.
        right = anchor.x1 + extent
        if right > width_limit:
            return
        masks.append(fitz.Rect(anchor.x1, anchor.y0, right, anchor.y1))

    # Pass 1: whole text blocks that mention the keyword.
    block_extent = 255 if keyword == "借款人住所" else 100
    for entry in text_blocks:
        if keyword not in entry[4].strip():
            continue
        queue_strip(fitz.Rect(entry[0], entry[1], entry[2], entry[3]), block_extent)

    # Pass 2: exact "keyword + colon" hits, acted on for phone numbers only.
    is_phone = keyword == "手机号码"
    for delimiter in (f"{keyword}：", f"{keyword}:"):
        for hit in page.search_for(delimiter):
            if is_phone and hit.width > len(keyword) * 5:
                queue_strip(hit, 100)

    return masks

async def process_pdf(file: UploadFile, keywords: List[str]):
    """Black out the values that follow each keyword in an uploaded PDF.

    Args:
        file: The uploaded PDF; its whole content is read into memory.
        keywords: Labels passed one by one to ``redact_after_keyword``.

    Returns:
        A dict with:
            ``pdf_bytes``: the redacted PDF, or the untouched upload bytes
                when no redaction rectangle was produced;
            ``text_content``: concatenated text of all pages;
            ``original_filename``: ``file.filename`` as received;
            ``was_redacted``: whether any redaction was applied.

    Raises:
        HTTPException: status 400 wrapping any error raised while reading or
            processing the document.
    """
    try:
        file_content = await file.read()
        doc = fitz.open(stream=file_content, filetype="pdf")
        try:
            was_redacted = False
            page_texts = []

            for page in doc:
                # NOTE(review): the text is captured BEFORE redactions are
                # applied, so it may still contain the masked values --
                # confirm this is intended for the .txt output.
                page_texts.append(page.get_text())

                page_rects = []
                for keyword in keywords:
                    page_rects.extend(redact_after_keyword(page, keyword))

                if page_rects:
                    was_redacted = True
                    for rect in page_rects:
                        page.add_redact_annot(rect, fill=(0, 0, 0))
                    page.apply_redactions()

            # Only re-serialise when something changed; otherwise hand back
            # the original bytes unchanged.
            pdf_bytes = doc.tobytes() if was_redacted else file_content
        finally:
            # Release the document even when a page fails to process.
            doc.close()

        return {
            "pdf_bytes": pdf_bytes,
            "text_content": "".join(page_texts),
            "original_filename": file.filename,
            "was_redacted": was_redacted
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"处理PDF文件时出错: {str(e)}") from e

def save_to_static(filename: str, content: bytes, static_dir=None):
    """Write ``content`` to a file inside the static directory.

    Only the final path component of ``filename`` is used, so a
    client-supplied name such as ``../../etc/x`` or ``/etc/x`` cannot place
    the file outside the target directory.

    Args:
        filename: Desired file name; any directory components are discarded.
        content: Raw bytes to write.
        static_dir: Directory to write into. Defaults to the module-level
            ``STATIC_DIR`` when omitted or ``None``.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If ``filename`` has no usable final component
            (empty, ``.`` or ``..``).
        OSError: If the file cannot be opened or written.
    """
    # Normalise Windows separators first so basename() also strips "..\\" parts.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid file name: {filename!r}")

    target_dir = STATIC_DIR if static_dir is None else static_dir
    filepath = os.path.join(target_dir, safe_name)
    with open(filepath, "wb") as f:
        f.write(content)
    return filepath

@app.post("/download_documents")
async def download_documents(
    files: List[UploadFile] = File(...),
    keywords: List[str] = Form(...)
):
    """Redact the uploaded PDFs and return them, with their text, as a ZIP.

    For every upload whose name ends in ``.pdf`` the redacted PDF
    (``redacted_<name>.pdf``) and the extracted text (``<name>.txt``) are
    written to the static directory and added to the ZIP archive. Other
    uploads are skipped.

    Args:
        files: Uploaded files from the multipart form.
        keywords: Labels to redact. "银行账号" additionally enables
            "还款账户号" and "收款账户号"; "身份证号" additionally enables
            "证件号码".

    Returns:
        A streaming ``application/zip`` response named
        ``processed_documents.zip``.

    Raises:
        HTTPException: The 400 raised by ``process_pdf`` is passed through
            unchanged; any other failure is reported as status 500.
    """
    try:
        # Work on a copy so the list parsed from the form is not mutated.
        search_terms = list(keywords)
        aliases = {
            "银行账号": ["还款账户号", "收款账户号"],
            "身份证号": ["证件号码"],
        }
        for base_term, extra_terms in aliases.items():
            if base_term in keywords:
                search_terms.extend(
                    term for term in extra_terms if term not in search_terms
                )

        zip_buffer = io.BytesIO()

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file in files:
                # The filename is client-controlled: tolerate a missing name
                # and drop directory components so neither the files on disk
                # nor the ZIP entries can point outside their directory.
                upload_name = os.path.basename(
                    (file.filename or "").replace("\\", "/")
                )
                if not upload_name.lower().endswith(".pdf"):
                    continue

                result = await process_pdf(file, search_terms)
                original_name = os.path.splitext(upload_name)[0]
                pdf_filename = f"redacted_{original_name}.pdf"
                txt_filename = f"{original_name}.txt"
                text_bytes = result["text_content"].encode('utf-8')

                save_to_static(pdf_filename, result["pdf_bytes"])
                save_to_static(txt_filename, text_bytes)

                zip_file.writestr(pdf_filename, result["pdf_bytes"])
                zip_file.writestr(txt_filename, text_bytes)

        zip_buffer.seek(0)
        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": "attachment; filename=processed_documents.zip"}
        )

    except HTTPException:
        # Keep the status code chosen deeper down (e.g. 400 for a bad PDF)
        # instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}") from e

# Script entry point: serve the app on all interfaces, port 8301, when this
# module is executed directly.
# NOTE(review): reload=True presumably turns on uvicorn's auto-reload, which
# is a development convenience -- confirm it is not wanted in production runs.
if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8301, reload=True)

脱敏前：