山东省第二届数字创新大赛-临沂赛场医保大数据分析

三更两点

发布于 2021-04-02 09:47:12

4070

发布于 2021-04-02 09:47:12

文章被收录于专栏：深度学习|机器学习|歌声合成|语音合成

数据处理以及分析

# -*- coding:utf-8 -*-
# /usr/bin/python
'''
-------------------------------------------------
   File Name   :  SummaryPlot.py
   Description :  
   Run Script  :  python  SummaryPlot.py
   Envs        :  python == 3.6.6
                  pip install numpy pandas
   Date        :  2021/3/23  上午9:07
   CodeStyle   :  standard, simple, readable, maintainable, and portable!
-------------------------------------------------
   Change Activity:
          2021/3/23 : build
-------------------------------------------------
__Author__ = "Yan Errol 13075851954"
__Email__ = "260187357@qq.com"
__Copyright__ = "Copyright 2021, Yan Errol"
-------------------------------------------------
'''
import warnings
warnings.filterwarnings("ignore")
import time
import numpy as np
import pandas as pd
import json
from properties_util import Properties

class PlotSummary(object):
    """Aggregate and summarise records loaded from a CSV file.

    The loaded table is kept in ``self.dataDF``. The methods read the columns
    ``CYRQ`` (epoch seconds), ``ZFY``, ``TCFY``, ``RYLB``, ``NL``, ``RYQH``
    and ``JZQH`` and write their results below ``../dataSets`` and
    ``../model`` (paths relative to the working directory).

    :param OutDir: output directory (stored, not used by the methods here)
    :param DataDir: data directory (stored, not used by the methods here)
    :param dataPath: path of the CSV file read by :meth:`readData`
    :param properties: mapping of configuration strings; the keys read are
        ``fill_nan``, ``label``, ``ScreeningLabel``, ``timelabel``,
        ``monthLabel`` and ``dist_path``
    """

    # Epoch-second cut-offs used by calCost to bucket rows into 2016/2017/2018.
    # NOTE(review): 1514736000 is 2018-01-01 00:00:00 at UTC+8, whereas
    # 1483163999 is 2016-12-31 13:59:59 at UTC+8 (05:59:59 UTC), i.e. not a
    # year boundary. Values are kept as found; confirm the intended boundary.
    _END_OF_2016 = 1483163999
    _START_OF_2018 = 1514736000

    def __init__(self, OutDir, DataDir, dataPath, properties):
        self.OutDir = OutDir
        self.DataDir = DataDir
        self.dataPath = dataPath
        # Same attribute name that every other method reads and writes.
        self.dataDF = None
        self.properties = properties

    def _split_labels(self, key):
        """Return the non-empty comma-separated entries of ``properties[key]``."""
        return [item for item in self.properties[key].split(",") if item != ""]

    @staticmethod
    def _load_classes(label):
        """Return the unique values that fill_nan saved for ``label`` as a list."""
        # allow_pickle=True is needed for object (string) arrays. Unpickling
        # can execute arbitrary code, so only load files written by fill_nan.
        saved = np.load('../model/{}.npy'.format(label), allow_pickle=True)
        return saved.tolist()

    @staticmethod
    def _to_native(value):
        """Convert a numpy scalar to the matching Python scalar for json."""
        return value.item() if isinstance(value, np.generic) else value

    @staticmethod
    def _safe_contains(container, item):
        """Return ``item in container``; False when the test is not supported."""
        try:
            return item in container
        except TypeError:
            return False

    @staticmethod
    def _monthly_summary(ClassDF):
        """Return month-end sums of ZFY/TCFY and the RYLB count per month.

        :param ClassDF: rows of one class, with a datetime ``CYRQs`` column
        :return: DataFrame with columns ZFY, TCFY and CountPatients
        """
        # An offset object is used instead of a "M"/"ME" alias string because
        # the accepted alias differs between pandas versions.
        monthly = ClassDF.set_index("CYRQs").resample(pd.offsets.MonthEnd())
        summary = monthly[['ZFY', 'TCFY']].sum()
        summary['CountPatients'] = monthly['RYLB'].count()
        return summary

    def _indexed_by_discharge(self):
        """Return dataDF indexed by the datetime form of the CYRQ column."""
        if "CYRQs" in self.dataDF.columns:
            return self.dataDF.set_index("CYRQs")
        # monthlabel normally adds CYRQs; derive it the same way if it has
        # not been run, so this method does not depend on call order.
        stamps = pd.to_datetime(self.dataDF["CYRQ"], unit="s").rename("CYRQs")
        return self.dataDF.set_index(stamps)

    def readData(self):
        """Read ``self.dataPath`` as CSV into ``self.dataDF``."""
        self.dataDF = pd.read_csv(self.dataPath, iterator=False)

    def fill_nan(self,):
        """Fill missing values, then save each label column's unique values.

        ``properties["fill_nan"]`` selects the strategy: ``"0"`` fills with 0,
        ``"mean"`` with the column mean (numeric columns only), ``"ffill"`` /
        ``"bfill"`` forward / backward fill. Any other value prints a message
        and leaves the data unchanged. For every entry of
        ``properties["label"]`` the unique values of that column are saved to
        ``../model/<label>.npy``.
        """
        strategy = self.properties["fill_nan"]
        if strategy == "0":
            self.dataDF = self.dataDF.fillna(0)
        elif strategy == "mean":
            # numeric_only avoids a TypeError on text columns in pandas 2.x.
            self.dataDF = self.dataDF.fillna(self.dataDF.mean(numeric_only=True))
        elif strategy == "ffill":
            self.dataDF = self.dataDF.ffill()
        elif strategy == "bfill":
            self.dataDF = self.dataDF.bfill()
        else:
            print("Only four method！")

        for label in self._split_labels("label"):
            uni = self.dataDF[label].unique()
            print("uni", uni)
            np.save('../model/{}.npy'.format(label), np.array(uni))

    def calCost(self, ClassDF):
        """Sum ZFY and TCFY per year bucket of the CYRQ column.

        :param ClassDF: DataFrame with columns CYRQ (epoch seconds), ZFY, TCFY
        :return: ``{'2016': {'ZFY': .., 'TCFY': ..}, '2017': .., '2018': ..}``
            with plain Python numbers so the result can be passed to json.
            '2016' holds every row up to the first cut-off and '2018' every
            row from the second cut-off on.
        """
        discharge = ClassDF['CYRQ']
        in_2017 = (discharge > self._END_OF_2016) & (discharge < self._START_OF_2018)
        buckets = (
            ('2016', ClassDF[discharge <= self._END_OF_2016]),
            ('2017', ClassDF[in_2017]),
            ('2018', ClassDF[discharge >= self._START_OF_2018]),
        )
        return {
            year: {
                'ZFY': self._to_native(rows['ZFY'].sum()),
                'TCFY': self._to_native(rows['TCFY'].sum()),
            }
            for year, rows in buckets
        }

    def Screening(self):
        """Compute calCost for every class of every screening label.

        Labels come from ``properties['ScreeningLabel']``; the classes of a
        label are the unique values saved by fill_nan. The nested result is
        written to ``../dataSets/Screening_Dict.json`` and returned.

        :return: ``{label: {class_value: calCost result}}``
        """
        Screening_Dict = {}
        for label in self._split_labels('ScreeningLabel'):
            per_class = {}
            for Class in self._load_classes(label):
                ClassDF = self.dataDF[self.dataDF[label] == Class]
                print(ClassDF, type(ClassDF))
                ZFY_TCFY_dict = self.calCost(ClassDF)
                print("ZFY_TCFY_dict", ZFY_TCFY_dict)
                per_class[Class] = ZFY_TCFY_dict
            Screening_Dict[label] = per_class
        print(Screening_Dict)
        # ensure_ascii=False emits raw non-ASCII text, so fix the encoding.
        with open('../dataSets/Screening_Dict.json', 'w', encoding='utf-8') as f:
            json.dump(Screening_Dict, f, ensure_ascii=False)
        return Screening_Dict

    def monthlabel(self):
        """Summarise ZFY/TCFY sums and RYLB counts per month for each class.

        Every column named in ``properties["timelabel"]`` is converted from
        epoch seconds to datetime and stored as ``<name>s``. Then, for each
        class of each label in ``properties['monthLabel']``, the monthly
        summary is computed. All summaries are written together to
        ``../dataSets/monthTransaction.csv``, with ``Label`` and ``Class``
        columns identifying the rows.

        :return: the combined DataFrame, or None when there was nothing to
            summarise
        """
        for label in self._split_labels("timelabel"):
            self.dataDF[label + "s"] = pd.to_datetime(self.dataDF[label], unit="s")

        print(self.dataDF)
        summaries = []
        for label in self._split_labels('monthLabel'):
            for Class in self._load_classes(label):
                ClassDF = self.dataDF[self.dataDF[label] == Class]
                if ClassDF.empty:
                    continue
                monthTransaction = self._monthly_summary(ClassDF)
                print(Class, "-------\n", monthTransaction)
                monthTransaction.insert(0, 'Class', Class)
                monthTransaction.insert(0, 'Label', label)
                summaries.append(monthTransaction)

        if not summaries:
            return None
        # Keep every class; writing after the loop used to keep the last only.
        allTransactions = pd.concat(summaries)
        allTransactions.to_csv("../dataSets/monthTransaction.csv")
        return allTransactions

    def CountPatients(self):
        """Count RYLB entries per day, per month and per year.

        The counts are printed and written to
        ``../dataSets/CountPatients1D.csv``, ``CountPatients1M.csv`` and
        ``CountPatients1Y.csv``.
        """
        visits = self._indexed_by_discharge()['RYLB']
        outputs = (
            ("1D", "../dataSets/CountPatients1D.csv"),
            (pd.offsets.MonthEnd(), "../dataSets/CountPatients1M.csv"),
            (pd.offsets.YearEnd(), "../dataSets/CountPatients1Y.csv"),
        )
        for rule, path in outputs:
            counts = visits.resample(rule).count()
            print(counts)
            counts.to_csv(path)

    def AgeDist(self):
        """Bucket the NL column into ten-year bands and aggregate per band.

        Adds the band as column ``年龄分层`` to ``self.dataDF`` and writes the
        aggregate to ``../dataSets/aggResult.csv``.

        :return: DataFrame indexed by band with the NL count and the TCFY sum
        """
        age_bins = [20, 30, 40, 50, 60, 70, 80]
        age_labels = ['20-30岁', '31-40岁', '41-50岁', '51-60岁', '61-70岁', "71-80岁"]
        # include_lowest puts age 20 into the first band, as its label says.
        self.dataDF['年龄分层'] = pd.cut(
            self.dataDF['NL'], age_bins, labels=age_labels, include_lowest=True
        )
        # observed=False keeps empty bands in the result on every pandas version.
        by_band = self.dataDF.groupby(by=['年龄分层'], observed=False)
        aggResult = pd.DataFrame(by_band['NL'].count())
        aggResult["TCFY"] = by_band['TCFY'].sum()
        print(aggResult, type(aggResult))
        aggResult.to_csv("../dataSets/aggResult.csv")
        return aggResult

    def Place(self):
        """Normalise RYQH through the district mapping and flag mismatches.

        The mapping is read as JSON from ``properties["dist_path"]``. An RYQH
        value contained in a mapping value is replaced by that mapping's key.
        ``MedicalMigration`` is 0 when the resulting RYQH is contained in the
        row's JZQH and 1 otherwise. The table is written to
        ``../dataSets/ALLfinal.csv`` before and ``../dataSets/data001.csv``
        after the change.
        """
        self.dataDF.to_csv('../dataSets/ALLfinal.csv')
        with open(self.properties["dist_path"], 'r') as j:
            dist_label = dict(json.loads(j.read()))
        print(dist_label, type(dist_label))

        # Work on plain lists and assign each column once instead of doing
        # per-cell .loc reads and writes for every row.
        regions = []
        migration = []
        for region, visited in zip(self.dataDF["RYQH"].tolist(),
                                   self.dataDF["JZQH"].tolist()):
            # Every mapping entry is tested in order against the current
            # value, so a replaced value can be replaced again later on.
            for key, value in dist_label.items():
                if self._safe_contains(value, region):
                    region = key
            regions.append(region)
            migration.append(0 if self._safe_contains(visited, region) else 1)
        self.dataDF["RYQH"] = regions
        self.dataDF["MedicalMigration"] = migration

        print(self.dataDF[['RYQH', 'JZQH', "MedicalMigration"]])
        # NOTE(review): the script entry point reads its input from this same
        # path, and the index is written as an extra column; confirm that
        # overwriting the input this way is intended.
        self.dataDF.to_csv('../dataSets/data001.csv')

if __name__ == "__main__":
    # Script entry point: load the configuration, then run every analysis
    # step in order on ../dataSets/data001.csv.
    read_config = Properties("../config/dataAnsys.properties")
    properties = read_config.get_properties()
    print(properties)

    OutDir, DataDir = "../Result", "../dataSets"
    dataPath = "../dataSets/data001.csv"
    plotsummary = PlotSummary(OutDir, DataDir, dataPath, properties)

    # Load and clean the data, then run the steps whose results are only
    # written to disk.
    for step in (
        plotsummary.readData,
        plotsummary.fill_nan,
        plotsummary.Screening,
        plotsummary.monthlabel,
        plotsummary.CountPatients,
    ):
        step()

    # The age distribution is also kept in memory.
    aggResult = plotsummary.AgeDist()
    plotsummary.Place()

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2021/04/01 ，如有侵权请联系 cloudcommunity@tencent.com 删除

数据处理

本文分享自作者个人站点/博客前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

数据处理

登录后参与评论

0 条评论

热度

山东省第二届数字创新大赛-临沂赛场医保大数据分析

山东省第二届数字创新大赛-临沂赛场医保大数据分析

数据处理以及分析

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

山东省第二届数字创新大赛-临沂赛场 医保大数据分析

山东省第二届数字创新大赛-临沂赛场 医保大数据分析

数据处理以及分析

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

山东省第二届数字创新大赛-临沂赛场医保大数据分析

山东省第二届数字创新大赛-临沂赛场医保大数据分析