高质量编码--Python提取与合并Excel数据

原创

MiaoGIS

发布于 2023-03-06 09:33:40

78900

代码可运行

文章被收录于专栏：Python in AI-IOTPython in AI-IOT

运行总次数：0

代码可运行

假如遇到下面场景，不同项目使用不同的Excel文件跟踪进度，每周在该Excel新增一个sheet表。同时有多个这样的项目Excel文件。需要将所有项目文件中的信息提取合并到一个Excel文件中方便统计。

合并后的Excel

为方便使用，将此功能升级开发为一个Web页面

之所以可以提取各Excel sheet中的数据，前提示它们满足一定的模板格式，为方便找到格式错误，同时提供了模板格式检查的功能。

下面看一下后台代码实现：

excel_tools.py用作Excel指定信息提取以及合并

# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
import os

import logging
import warnings
warnings.filterwarnings('ignore')
handler=logging.FileHandler('log.txt',mode='w')
logger=logging.getLogger(__name__)
logger.addHandler(handler)
def parse_week(filePath,sheet):
    _,fileNameExt=os.path.split(filePath)
    fileName,_=os.path.splitext(fileNameExt)
     

    df=pd.read_excel(filePath,sheet_name=sheet)

    row1=df.iloc[0,1:]
    name=row1[row1.notna()].values[0]

    
    columns1=df.iloc[:,0].str.strip()
    df.set_index(columns1,inplace=True)
    df.drop(columns=[df.columns[0]],inplace=True)
    
    dfTable=df.loc['本周任务':'下周计划']
    dfTable.reset_index(inplace=True)
    dfHead=dfTable.iloc[1]
    dfBody=dfTable.iloc[2:-1]
    dfBody.columns=dfHead
    
    dfBody.rename(columns=lambda x:str(x).strip(),inplace=True)
    #dfBody=dfBody.iloc[:,[5,-2]]
    dfBody=dfBody[['责任人','实际工作量(h)']]
    
    dfBody.dropna(inplace=True)
    dfBody=dfBody.assign(项目名称=name,文件来源=fileName,sheet=sheet)
    logger.setLevel(logging.DEBUG)
    logger.log(logging.DEBUG,f'{name}:{len(dfBody)}个人')
    return dfBody

def combine_week(dirPath,mode,customs):
     
    files=os.listdir(dirPath)

    sheets=[]
    sheetDict={}
    dfs=[]
    for file in files:
        filePath=os.path.join(dirPath,file)
        excel=pd.ExcelFile(filePath)
        sheetDict[file]=excel.sheet_names
        if mode=='first':
            sheetName=excel.sheet_names[0]
            sheets.append((file,sheetName))
        if mode=='last':
            sheetName=excel.sheet_names[-1]
            sheets.append((file,sheetName))
    if mode=='preview':
        pass
    if mode=='custom':
        
        customs=[(file,sheetName) for file,sheetName in customs if file in files]
            
        sheets=customs 
    errorDict={}
    print('sheets',sheets)
    for file,sheetName in sheets:
        errorDict.setdefault(file,{})
        try:
            filePath=os.path.join(dirPath,file)
            df0=parse_week(filePath,sheetName)
        except:
            print('error',file,sheetName)
            errorDict[file][sheetName]=True
            df0=None
        print(df0)
        dfs.append(df0)
     
    if dfs and not list(filter(lambda x:isinstance(x,None.__class__),dfs)):
        df=pd.concat(dfs)
    else:
        df=None
        
    print('sheetDict',sheetDict)
    return df,sheetDict,errorDict

if __name__=='__main__':
    
    pass

server.py使用tornado Web框架开发后台

# -*- coding:utf-8 -*-
import tornado.web
import tornado.httpserver
from tornado.options import options,define
import tornado.ioloop
import os

import pandas as pd
import json
from excel_tools import *
define('port',default=8000,type=int)


class BaseHandler(tornado.web.RequestHandler):
    def get_current_user(self):
        user=self.get_secure_cookie('user',None)
        return user
class listHandler(BaseHandler):
    @tornado.web.authenticated
    def get(self):
        print('list')
        self.render('list.html',username=self.current_user)
     
    def post(self):
        username=self.get_argument('username')
        data=Excel.filter(owner=username)
        data=list(data.dicts())
        print(username)
        self.write({'result':data})
      

class combineHandler(tornado.web.RequestHandler):
    def get(self):
        
        self.render('combine.html',df=None,result=None,errorDict=None,sheetDict=None)
    def post(self):
        df=None
        mode=self.get_argument('mode','first')
        sheetDict=None
        errorDict=None
        customs=None
        sheets=self.get_argument('sheets',None)
        print('sheets',sheets)
        if sheets:
            sheets=sheets.split(',')
            customs=[sheet.split('<>') for sheet in sheets]
            
            print(customs)
        files=self.request.files['file']
        dirPath='./static/combine/'+str(uuid.uuid1())
        os.mkdir(dirPath)
        
        for file in files:
            filename=file['filename']
         
            with open(dirPath+'/'+filename,'wb') as f:    
                f.write(file.body)
       
        df,sheetDict,errorDict=combine_week(dirPath,mode=mode,customs=customs)
        
        print(sheetDict)
        self.render('combine.html',df=df,errorDict=errorDict,sheetDict=sheetDict)

        
class App(tornado.web.Application):
    def __init__(self,*args,**kwargs):  
        tornado.web.Application.__init__(self,*args,**kwargs)

if __name__=='__main__':
    
    handlers=[('/combine',combineHandler),
                  ]
    template_path=os.path.join('.','template')
    static_path=os.path.join('.','static')
    app=App(handlers=handlers,template_path=template_path,
            static_path=static_path,login_url='/login',
            debug=True,cookie_secret='miao')
    options.parse_command_line()
    server=tornado.httpserver.HTTPServer(app)
    server.listen(options.port)
    tornado.ioloop.IOLoop().current().start()

前端页面combine.html代码如下：

<!doctype html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta HTTP-EQUIV="pragma" CONTENT="no-cache">
    <!-- 上述3个meta标签*必须*放在最前面，任何其他内容都*必须*跟随其后！ -->
    <title>欢迎使用</title>

    <!-- Bootstrap -->
    <link rel="stylesheet" href="/static/bootstrap/css/bootstrap.min.css">
    <script src="/static/jquery/jquery.min.js"></script>
    <script src="/static/underscore/underscore.js"></script>
     <script src="/static/leg-tree.js"></script>
    <script src="/static/bootstrap/js/bootstrap.min.js"></script>
    <link href="/static/bootstrap-table/dist/extensions/group-by-v2/bootstrap-table-group-by.css" rel="stylesheet">

    <link href="/static/bootstrap-table/dist/bootstrap-table.min.css" rel="stylesheet">
    <link href="/static/css/tree.css" rel="stylesheet">
    <script src="https://cdn.jsdelivr.net/npm/tableexport.jquery.plugin@1.10.21/tableExport.min.js"></script>
    
    <script src="/static/bootstrap-table/dist/bootstrap-table.min.js"></script>
    
    <script src="/static/bootstrap-table/dist/extensions/group-by-v2/bootstrap-table-group-by.min.js"></script>
<script src="static/bootstrap-table/dist/extensions/export/bootstrap-table-export.js"></script>

    <script src="static/bootstrap-table/dist/locale/bootstrap-table-zh-CN.js"></script>
    <style>
        [type="submit"] {
            cursor: pointer;
        }
        
        
        
    </style>
     
</head>

<body>
 
   
    
        <div class="row">
    <div class="col-md-4">
        
    <form action="/combine" method="post" enctype="multipart/form-data">
 
  <div class="form-group">
    <label for="file">上传周报Excel</label>
    <input type="file"  id="file" name="file" required accept=".xlsx" multiple>
    <p class="help-block">可以同时选择多个Excel.</p>
  </div>
        <a href="/static/X项目周报.xlsx" >demo1下载</a>
        <a href="/static/Y项目周报.xlsx" >demo2下载</a>
         <a href="/static/Z项目周报.xlsx" >demo3下载</a>
          <a href="/static/A项目周报.xlsx" >错误格式demo下载</a>
  <div class="checkbox">
       <label class="radio-inline">
      <input type="radio" name="mode" id="preview" required value="preview"> 
      预览sheets</label>
        <label class="radio-inline">
      <input type="radio" name="mode" id="first" required value="first"> 
      使用第一个sheet</label>
      <label class="radio-inline">
        <input type="radio" name="mode"  id="last" required value="last"> 
      使用最后一个sheet</label>
          {% if sheetDict %}
           <label class="radio-inline" style="float: right;">
        <input type="radio" name="mode"   id="custom" required value="custom"> 
     自定义在右边树中选择需要合并的sheet</label>
   
              {% end %}    
     
  </div>
    <input type="hidden" name="sheets" id="sheets">
  <button type="submit" class="btn btn-primary">提交</button>
       <a href="/dashboard" class="btn btn-default">返回</a>
</form>
        {% if  errorDict %}
           <table class="table table-hover">
      <thead>
        <tr>
        
          <th>出错文件名称</th>
          <th>sheet名称</th>
       
        </tr>
      </thead>
      <tbody>
         {% set  fail=False %}
         {% for excel,sheet_dict in errorDict.items() %}
           {% for sheet,failure in sheet_dict.items() %}
          {% if  failure %}
          {% set  fail=True %}
        <tr>
        
          <td>{{excel}}</td>
          <td>{{sheet}}</td>
        
        </tr>
          {% end %}
           {% end %}
             {% end %}
      </tbody>
    </table>
        
          {% if not fail %}
          <h2>全部文件检查后，格式都正确</h2>
         {% end %}
            {% end %}
         </div>
        <div class="col-md-4">
    
         {% if sheetDict %}
      
        <div class="tree">
              {% end %}
		</div>
         
      </div>
      <div class="col-md-4">
       
     {% if not isinstance(df,None.__class__)  %}
<table id="table" data-show-export="true" data-file-name="周报合并"
  data-group-by="true"
  data-group-by-field="项目名称"
  >
  <thead>
    <tr>
      
      
 
        <th data-field="项目名称">项目名称</th>
        <th data-field="责任人">责任人</th>
      <th data-field="实际工作量(h)">实际工作量(h)</th>
          <th data-field="文件来源">文件来源</th>
      <th data-field="sheet">sheet</th>
    </tr>
  </thead>
</table>
 
    <input type="hidden" value="{{df.to_json(orient='records')}}" id="result">
     
     {% end %}
    {% import json %}
    {% if sheetDict %}
         <input type="hidden" value="{{json.dumps(sheetDict)}}" id="sheetDict"> 
    {% end %}
    
        </div>
     
     
   
  </div>
   
    
    <script src="/static/js/combine.js?v=2.0"></script>

</body>

</html>

页面JavaScript代码combine.js

$(function () {
 
    
    
    if (!!$('#result')[0]) {
        var data = JSON.parse($('#result').val());
        $table = $('#table').bootstrapTable();
        
        $table.bootstrapTable('destroy').bootstrapTable({
        
        exportTypes: ['json', 'xml', 'csv', 'txt', 'sql', 'excel', 'pdf'],
        exportOptions:{fileName:'合并周报'},
      })
        
   
        
        
        
        console.log(data);
        $table.bootstrapTable('load', data);
    }
    if (!!$('#sheetDict')[0]) {
        var sheetDict = JSON.parse($('#sheetDict').val());
        var treeData = _.map(sheetDict, (v, k) => {
            return {
                "name": k,
                "open": true,
                "ico": 'iconfont icon-yonghu',
                "cascade": true,
                "id": k,
                "checked": false,
                "children": _.map(v, value => {
                    return {
                        "name": value,
                        "open": true,
                        "ico": 'iconfont icon-yonghu',
                        "id": "---" + k + '<>' + value,
                        "checked": false
                    }
                })
            }
        })




        var arrs = []


        $(".tree").tree(treeData, arrs);

    }
 



    $('form').submit(function () {
        var sheets = _.filter($.getCheckedNodes(), x => x.startsWith('---')).map(x=>x.replace('---',''));
        console.log(sheets);
        $('#sheets').val(sheets.join(','));
    });

})