项目结构:
your_project/
│
├── main.py
├── requirements.txt
├── Dockerfile
├── run.sh
└── static/
    ├── input_pdfs/
    ├── output_pdfs/
    └── output_txts/
requirements.txt:
fastapi
uvicorn
PyMuPDF
# Required by FastAPI to parse multipart form uploads (File/Form parameters).
python-multipart
Dockerfile:
# Use the official slim Python image.
FROM python:3.11-slim
# Install system dependencies (the author notes that PyMuPDF needs libmupdf).
RUN apt-get update && apt-get install -y \
    build-essential \
    libmupdf-dev \
    && rm -rf /var/lib/apt/lists/*
# Set the working directory.
WORKDIR /app
# Install Python dependencies before copying the source, so this layer stays
# cached until requirements.txt changes instead of being rebuilt on every
# source edit.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application code.
COPY . /app
# Create the static directory (used as a mount point or for testing).
RUN mkdir -p /app/static
# Run Uvicorn when the container starts.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8301"]
通过run.sh或者手动构建docker:
#!/bin/bash
# Abort on the first failing command so that a failed build never starts a
# container from a stale image.
set -euo pipefail
# Build the image.
sudo docker build -t pdf-redact-app .
# Start the service detached, publishing port 8301 and mounting the host data
# directory over the container's static directory.
sudo docker run -d \
    -p 8301:8301 \
    -v /data/desensitization_files:/app/static \
    --name pdf-redactor \
    pdf-redact-app
进入容器内部(调试用)
docker exec -it pdf-redactor /bin/bash
文件脱敏助手源码:
# main.py
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import StreamingResponse
import uvicorn
import fitz
import os
from typing import List
from tempfile import NamedTemporaryFile
import io
from pathlib import Path
import zipfile
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
app = FastAPI()
# Let browser front-ends on any origin call the API. Credentials are disabled
# because a wildcard origin must not be combined with credentialed requests,
# and no endpoint visible in this file reads cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Directory where processed files are persisted; created eagerly at import
# time so later writes do not fail on a missing directory.
STATIC_DIR = "/app/static"
Path(STATIC_DIR).mkdir(parents=True, exist_ok=True)
def redact_after_keyword(page, keyword):
    """Collect the rectangles to black out for *keyword* on *page*.

    Two heuristics are applied:

    1. Every text block whose text contains ``keyword`` yields a rectangle
       that starts at the block's right edge and extends a fixed width to the
       right (255 points for "借款人住所", 100 points otherwise).
    2. For the keyword "手机号码" only, every search hit of the keyword
       followed by a colon (ASCII or full-width) that is wider than
       ``len(keyword) * 5`` points yields a 100-point rectangle starting at
       the hit's right edge.

    A rectangle that would extend past the page's right edge is skipped.

    Args:
        page: A PyMuPDF page; ``get_text``, ``search_for`` and ``rect`` are
            used.
        keyword: Label text whose following value should be masked.

    Returns:
        A list of ``fitz.Rect`` objects; empty when nothing matched.
    """
    page_width = page.rect.width
    redact_rects = []

    def add_mask(x0, y0, y1, width):
        # Skip masks that would run off the right edge of the page.
        x1 = x0 + width
        if x1 <= page_width:
            redact_rects.append(fitz.Rect(x0, y0, x1, y1))

    # Heuristic 1: mask the area to the right of every block mentioning the
    # keyword. Block tuples are (x0, y0, x1, y1, text, ...).
    block_mask_width = 255 if keyword == "借款人住所" else 100
    for block in page.get_text("blocks"):
        if keyword in block[4].strip():
            add_mask(block[2], block[1], block[3], block_mask_width)

    # Heuristic 2 only ever applies to the phone-number label, so the page is
    # not searched at all for other keywords.
    if keyword == "手机号码":
        min_hit_width = len(keyword) * 5
        # Search for the label with an ASCII colon and with a full-width colon
        # (U+FF1A). The two variants must differ: listing the same delimiter
        # twice would add every hit twice and never match full-width labels.
        for delim in (f"{keyword}:", f"{keyword}\uff1a"):
            for hit in page.search_for(delim):
                if hit.width > min_hit_width:
                    add_mask(hit.x1, hit.y0, hit.y1, 100)
    return redact_rects
async def process_pdf(file: UploadFile, keywords: List[str]):
    """Redact keyword values in an uploaded PDF and extract its text.

    For every page, the rectangles reported by ``redact_after_keyword`` for
    each keyword are added as black redaction annotations and applied. The
    page text is extracted *after* the redactions are applied, so the text
    output does not leak the values that were masked in the PDF.

    Args:
        file: The uploaded PDF.
        keywords: Labels whose following values should be masked.

    Returns:
        A dict with the keys ``pdf_bytes`` (the redacted PDF, or the original
        bytes when nothing was redacted), ``text_content`` (concatenated page
        text), ``original_filename`` and ``was_redacted``.

    Raises:
        HTTPException: Status 400 when the upload cannot be read, parsed or
            redacted.
    """
    try:
        file_content = await file.read()
        doc = fitz.open(stream=file_content, filetype="pdf")
        try:
            was_redacted = False
            page_texts = []
            for page in doc:
                rects = [
                    rect
                    for keyword in keywords
                    for rect in redact_after_keyword(page, keyword)
                ]
                if rects:
                    for rect in rects:
                        page.add_redact_annot(rect, fill=(0, 0, 0))
                    page.apply_redactions()
                    was_redacted = True
                # Read the text only now, so redacted content is excluded.
                page_texts.append(page.get_text())
            pdf_bytes = doc.tobytes() if was_redacted else file_content
        finally:
            # Release the document even when redaction or export fails.
            doc.close()
        return {
            "pdf_bytes": pdf_bytes,
            "text_content": "".join(page_texts),
            "original_filename": file.filename,
            "was_redacted": was_redacted
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"处理PDF文件时出错: {str(e)}") from e
def save_to_static(filename: str, content: bytes, *, base_dir: str = ""):
    """Write *content* to a file directly inside the static directory.

    Only the final path component of *filename* is used, so a name derived
    from a client-supplied upload name cannot escape the target directory
    through ``..`` segments, sub-directories or an absolute path.

    Args:
        filename: Desired file name; any directory part is discarded.
        content: Raw bytes to write.
        base_dir: Target directory; ``STATIC_DIR`` is used when empty.

    Returns:
        The path of the written file.

    Raises:
        ValueError: If *filename* has no usable final component.
    """
    # Treat backslashes as separators too, so Windows-style client paths are
    # reduced to their last component as well.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid file name: {filename!r}")
    filepath = os.path.join(base_dir or STATIC_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(content)
    return filepath
@app.post("/download_documents")
async def download_documents(
    files: List[UploadFile] = File(...),
    keywords: List[str] = Form(...)
):
    """Redact the uploaded PDFs and return them, with their text, as a zip.

    Uploads whose name does not end in ``.pdf`` are ignored. For each PDF the
    redacted document (``redacted_<name>.pdf``) and its extracted text
    (``<name>.txt``) are saved to the static directory and added to the
    returned archive.

    Args:
        files: Uploaded files.
        keywords: Labels whose values should be masked. "银行账号" and
            "身份证号" are expanded with their alias labels.

    Returns:
        A streaming ``application/zip`` response named
        ``processed_documents.zip``.

    Raises:
        HTTPException: Status 400 from ``process_pdf`` is passed through
            unchanged; any other failure is reported as status 500.
    """
    try:
        # Work on a copy so the alias expansion never mutates the caller's list.
        keywords = list(keywords)
        if "银行账号" in keywords:
            keywords.extend(["还款账户号", "收款账户号"])
        if "身份证号" in keywords:
            keywords.extend(["证件号码"])
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file in files:
                # The upload name is client-controlled: it may be missing and
                # may contain directory parts. Keep only the final component so
                # neither the saved files nor the zip entries can point outside
                # their intended location.
                upload_name = os.path.basename((file.filename or "").replace("\\", "/"))
                if not upload_name.lower().endswith(".pdf"):
                    continue
                result = await process_pdf(file, keywords)
                original_name = os.path.splitext(upload_name)[0]
                pdf_filename = f"redacted_{original_name}.pdf"
                txt_filename = f"{original_name}.txt"
                text_bytes = result["text_content"].encode('utf-8')
                save_to_static(pdf_filename, result["pdf_bytes"])
                save_to_static(txt_filename, text_bytes)
                zip_file.writestr(pdf_filename, result["pdf_bytes"])
                zip_file.writestr(txt_filename, text_bytes)
        zip_buffer.seek(0)
        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": "attachment; filename=processed_documents.zip"}
        )
    except HTTPException:
        # Keep the status code chosen further down (e.g. 400 for a bad PDF)
        # instead of rewrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}") from e
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces at port
    # 8301 with auto-reload enabled.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8301,
        reload=True,
    )
脱敏前:
脱敏后:
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。