首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
MCP广场
社区首页 >专栏 >Docker部署文件脱敏助手

Docker部署文件脱敏助手

原创
作者头像
happywei
修改2025-08-08 09:26:56
修改2025-08-08 09:26:56
14300
代码可运行
举报
运行总次数:0
代码可运行

项目结构:

代码语言:batch
复制
your_project/
│
├── main.py
├── requirements.txt
├── Dockerfile
├── run.sh
└── static/
    ├── input_pdfs/
    ├── output_pdfs/
    └── output_txts/

requirements.txt:

代码语言:batch
复制
fastapi
uvicorn
PyMuPDF
# Needed by FastAPI to parse multipart uploads declared with File(...) / Form(...)
python-multipart

Dockerfile:

代码语言:batch
复制
# Use the official Python image
FROM python:3.11-slim

# Install system dependencies (build toolchain and MuPDF development files for PyMuPDF)
RUN apt-get update && apt-get install -y \
    build-essential \
    libmupdf-dev \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Install Python dependencies before copying the source tree, so this layer is
# reused from cache until requirements.txt itself changes
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . /app

# Create the static directory (used as a mount point or for testing)
RUN mkdir -p /app/static

# Document the port the service listens on
EXPOSE 8301

# Run Uvicorn when the container starts
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8301"]

通过 run.sh 脚本(或手动逐条执行其中的命令)构建镜像并启动容器:

代码语言:batch
复制
#!/bin/bash
# Build the image and start the service container in the background.
# Stop at the first failing command so that a failed build is never followed
# by `docker run` against a stale or missing image.
set -euo pipefail

# Build image
sudo docker build -t pdf-redact-app .

# Publish port 8301 and bind-mount the host directory over /app/static
sudo docker run -d \
  -p 8301:8301 \
  -v /data/desensitization_files:/app/static \
  --name pdf-redactor \
  pdf-redact-app

进入容器内部(调试用):

代码语言:batch
复制
# The container name must match the --name passed to `docker run` (pdf-redactor)
sudo docker exec -it pdf-redactor /bin/bash

文件脱敏助手源码:

代码语言:python
代码运行次数:0
运行
复制
# main.py
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import StreamingResponse
import uvicorn
import fitz
import os
from typing import List
from tempfile import NamedTemporaryFile
import io
from pathlib import Path
import zipfile

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# ASGI application object; referenced as "main:app" when launching uvicorn.
app = FastAPI()

# CORS: accept any origin, method and header, with credentials allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm whether origins should be restricted outside of testing.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Static directory: processed PDF/TXT outputs are written here.
# It is created at import time if it does not exist yet.
STATIC_DIR = "/app/static"
Path(STATIC_DIR).mkdir(parents=True, exist_ok=True)

def redact_after_keyword(page, keyword):
    redact_rects = []
    blocks = page.get_text("blocks")
    page_width = page.rect.width

    for block in blocks:
        block_text = block[4].strip()
        if keyword in block_text:
            keyword_rect = fitz.Rect(block[0], block[1], block[2], block[3])
            mask_width = 255 if keyword == "借款人住所" else 100
            mask_x1 = keyword_rect.x1 + mask_width
            if mask_x1 > page_width:
                continue
            redact_rects.append(fitz.Rect(keyword_rect.x1, keyword_rect.y0, mask_x1, keyword_rect.y1))

    for delim in [f"{keyword}:", f"{keyword}:"]:
        for inst in page.search_for(delim):
            if keyword == "手机号码" and inst.width > len(keyword) * 5:
                mask_x1 = inst.x1 + 100
                if mask_x1 > page_width:
                    continue
                redact_rects.append(fitz.Rect(inst.x1, inst.y0, mask_x1, inst.y1))

    return redact_rects

async def process_pdf(file: UploadFile, keywords: List[str]):
    """Redact the values following *keywords* in an uploaded PDF.

    The upload is read fully into memory and opened with PyMuPDF. For each
    page, the page text is collected and the rectangles returned by
    ``redact_after_keyword`` for every keyword are applied as black redactions.

    NOTE(review): the page text is extracted *before* redactions are applied,
    so ``text_content`` still contains the unmasked values — confirm that this
    is intended for the exported .txt files.

    Args:
        file: Uploaded PDF.
        keywords: Labels whose following values should be masked.

    Returns:
        A dict with keys ``pdf_bytes`` (the redacted document, or the original
        bytes when nothing was redacted), ``text_content`` (concatenated page
        text), ``original_filename`` and ``was_redacted``.

    Raises:
        HTTPException: status 400 when reading or processing the PDF fails.
    """
    try:
        file_content = await file.read()
        doc = fitz.open(stream=file_content, filetype="pdf")
        try:
            page_texts = []
            was_redacted = False

            for page in doc:
                page_texts.append(page.get_text())

                page_rects = []
                for keyword in keywords:
                    page_rects.extend(redact_after_keyword(page, keyword))
                if not page_rects:
                    continue

                for rect in page_rects:
                    page.add_redact_annot(rect, fill=(0, 0, 0))
                page.apply_redactions()
                was_redacted = True

            # Hand back the untouched upload bytes when nothing was masked.
            pdf_bytes = doc.tobytes() if was_redacted else file_content
        finally:
            # Release the document even if a page fails mid-way.
            doc.close()

        return {
            "pdf_bytes": pdf_bytes,
            "text_content": "".join(page_texts),
            "original_filename": file.filename,
            "was_redacted": was_redacted
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"处理PDF文件时出错: {str(e)}") from e

def save_to_static(filename: str, content: bytes):
    """Write *content* to a file directly inside ``STATIC_DIR``.

    Only the final path component of *filename* is used, so a name containing
    directory parts (``../``, an absolute path, or Windows-style backslash
    separators) cannot place the file outside ``STATIC_DIR``. The name
    originates from a client upload and must be treated as untrusted.

    Args:
        filename: Desired file name; any directory components are discarded.
        content: Raw bytes to write.

    Returns:
        The path of the written file.

    Raises:
        ValueError: if no usable file name remains after sanitising.
    """
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid file name: {filename!r}")

    filepath = os.path.join(STATIC_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(content)
    return filepath

@app.post("/download_documents")
async def download_documents(
    files: List[UploadFile] = File(...),
    keywords: List[str] = Form(...)
):
    """Redact the uploaded PDFs and return the results as one zip archive.

    Uploads whose name does not end in ".pdf" (case-insensitive) are skipped.
    For every processed PDF, ``redacted_<name>.pdf`` and ``<name>.txt`` are
    written to the static directory and added to the archive.

    The keyword "银行账号" additionally enables "还款账户号" and "收款账户号";
    "身份证号" additionally enables "证件号码".

    Args:
        files: Uploaded files.
        keywords: Labels whose following values should be masked.

    Returns:
        A ``StreamingResponse`` carrying ``processed_documents.zip``.

    Raises:
        HTTPException: the error raised by ``process_pdf`` is passed through
            unchanged (status 400); any other failure is reported as 500.
    """
    try:
        # Expand aliases on a copy so the list bound from the request is not mutated.
        effective_keywords = list(keywords)
        if "银行账号" in effective_keywords:
            effective_keywords.extend(["还款账户号", "收款账户号"])
        if "身份证号" in effective_keywords:
            effective_keywords.append("证件号码")

        zip_buffer = io.BytesIO()

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file in files:
                # The client-supplied name is untrusted and may be missing:
                # keep only its last path component so neither the files on
                # disk nor the zip entries can carry directory parts.
                upload_name = os.path.basename((file.filename or "").replace("\\", "/"))
                if not upload_name.lower().endswith(".pdf"):
                    continue

                result = await process_pdf(file, effective_keywords)
                original_name = os.path.splitext(upload_name)[0]
                pdf_filename = f"redacted_{original_name}.pdf"
                txt_filename = f"{original_name}.txt"
                text_bytes = result["text_content"].encode('utf-8')

                save_to_static(pdf_filename, result["pdf_bytes"])
                save_to_static(txt_filename, text_bytes)

                zip_file.writestr(pdf_filename, result["pdf_bytes"])
                zip_file.writestr(txt_filename, text_bytes)

        zip_buffer.seek(0)
        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": "attachment; filename=processed_documents.zip"}
        )

    except HTTPException:
        # Keep the status code and detail chosen by process_pdf instead of
        # re-wrapping them as a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}") from e

# Allows starting the service with `python main.py`; binds to all interfaces
# on port 8301 with uvicorn's reload option switched on.
if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8301, reload=True)

脱敏前:

脱敏后:

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档