Copyright notice: this is an original article by the author; reposting is welcome. https://cloud.tencent.com/developer/article/1453997
Note: this program is a sample from an MIIT (Ministry of Industry and Information Technology) machine-learning training course. It reads a labeled dataset from an Excel workbook, maps text columns to integer codes, trains a scikit-learn decision tree, and renders the tree with pydotplus (which requires a local Graphviz installation).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import unicodedata
import copy
from xlrd import open_workbook
from xlrd import XL_CELL_TEXT, XL_CELL_NUMBER, XL_CELL_DATE, XL_CELL_BOOLEAN
import numpy as np
from sklearn import tree
# visualization of the resulting tree
from io import StringIO  # sklearn.externals.six has been removed from recent scikit-learn releases
import pydotplus
def is_number(num):
    """Return True if the string represents a number (including Unicode numerals)."""
    # anchored pattern for signed integers and decimals
    pattern = re.compile(r'^[-+]?(\d+(\.\d*)?|\.\d+)$')
    result = pattern.match(num)
    if result:
        return True
    else:
        try:  # check numerals written as Unicode characters (e.g. CJK digits)
            unicodedata.numeric(num)
            return True
        except (TypeError, ValueError):
            pass
    return False
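# Illustrative checks (not executed here):
#   is_number('3.14') -> True      is_number('-42') -> True
#   is_number('四')    -> True      (unicodedata.numeric knows CJK numerals)
#   is_number('abc')  -> False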
def sheet_to_array(filename, sheet_number, first_col=0, last_col=None, header=True):
    """Return a feature matrix and a label list from one sheet of an Excel spreadsheet.

    Notes:
    0. The data follow the Excel layout; every value in a column should have the same type.
    1. If first_col is 0 and last_col is None, all columns are used.
    2. If header is True, exactly one header row is assumed.
    3. All rows are loaded.
    4. The first column is a sequence number and the last column is the label;
       the columns in between are the features.
    """
    DEBUG = False
    # sheet
    book = open_workbook(filename)
    sheet0 = book.sheet_by_index(sheet_number)
    rows = sheet0.nrows  # number of rows
    # cols
    if not last_col:
        last_col = sheet0.ncols  # number of columns
    if first_col >= last_col:
        raise Exception("First column must be smaller than last column!")
    cols = [col for col in range(first_col, last_col)]
    # rows
    skip = 0
    header_names = []
    if header:  # handle the header row, if any
        skip = 1
        for item in sheet0.row(0):
            header_names.append(item.value)
        del header_names[0]   # drop the sequence-number column
        del header_names[-1]  # drop the label column
        print(header_names)
    # data = np.empty([len(cols), rows - skip])
    print('define: ', len(cols), rows - skip)
    # Instead of a numpy array, build a plain 2-D list so mixed data types can coexist.
    data = [['' for i in range(len(cols) - 2)] for j in range(rows - skip)]
    #
    print(data)
    datatype = ['' for i in range(len(cols) - 2)]  # records the data type of each feature column
    for col, cell in enumerate(sheet0.row(skip)[1:-1]):  # cell types taken from the first data row
        datatype[col] = cell.ctype  # store the numeric type code
        # if cell.ctype == XL_CELL_NUMBER:
        #     datatype[col] = 'number'
        # elif cell.ctype == XL_CELL_TEXT:
        #     datatype[col] = 'text'
        # elif cell.ctype == XL_CELL_DATE:
        #     datatype[col] = 'date'
        # elif cell.ctype == XL_CELL_BOOLEAN:
        #     datatype[col] = 'bool'
    print(datatype)
    checktypediff = False  # set if any column contains a type different from its first data row
    labellist = []
    for row in range(skip, sheet0.nrows):
        row_values = sheet0.row(row)
        # Strip the first column (sequence number) and the last column (label, kept separately).
        label = row_values[-1]
        features = row_values[1:-1]
        # print(row_values)
        # print(features, label)
        labellist.append(label.value)
        labeltype = label.ctype
        for col, cell in enumerate(features):
            if DEBUG and row < 2:
                print(row, col, cell.ctype, cell.value, ' -- ', row - skip, col - first_col, '\n')
            # if col in cols and cell.ctype == XL_CELL_NUMBER:
            if col in cols:
                if cell.ctype == XL_CELL_NUMBER:
                    data[row - skip][col - first_col] = cell.value
                elif cell.ctype == XL_CELL_TEXT:
                    if is_number(cell.value):
                        data[row - skip][col - first_col] = float(cell.value)
                    else:
                        data[row - skip][col - first_col] = cell.value.strip()
                else:
                    data[row - skip][col - first_col] = cell.value
                if datatype[col] != cell.ctype:
                    checktypediff = True
    # print(labellist)
    if checktypediff:
        print('Warning: a column contains inconsistent data types!')
    # arraydata = np.array(data)  # convert to a numpy array
    return data, datatype, labellist, labeltype, header_names
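# Illustrative return values for the apple/orange workbook used below (taken from the run output):
#   data      -> [[140.0, '光滑'], [130.0, '光滑'], ...]
#   datatype  -> [2, 1]            (xlrd codes: 2 = XL_CELL_NUMBER, 1 = XL_CELL_TEXT)
#   labellist -> ['橘子', '橘子', '苹果', ...], labeltype -> 1
#   header_names -> ['重量', '表皮光滑度']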
# Convert a list of user-defined strings into a list of integer codes,
# and also build the string-to-code lookup dictionary.
def list_to_value(srcList):
    setSrcList = set(srcList)      # convert to a set to remove duplicates
    sndSrcList = list(setSrcList)  # back to a list
    outputDict = dict(zip(sndSrcList, range(len(sndSrcList))))  # map each value to an integer code
    outputList = []
    for item in srcList:
        outputList.append(outputDict[item])
    return outputDict, outputList
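# Illustrative example (values match the first run below; set() ordering is not guaranteed,
# so the code assigned to each string can differ between runs):
#   list_to_value(['光滑', '光滑', '粗糙'])  ->  ({'光滑': 0, '粗糙': 1}, [0, 0, 1])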
if __name__ == '__main__':
    filename = "../dataset/appleororange-2.xlsx"
    # filename = "houseloan-data-2.xlsx"
    # filename = "playtennis-data-2.xlsx"
    SHEET = 0  # the sheet number being processed
    featuredata, featuredatatype, labellist, labeltype, header = sheet_to_array(filename, SHEET, header=True)
    # Only a deep copy produces a new, independent nested list.
    rawfeaturedata = copy.deepcopy(featuredata)
    print('Features:', featuredata)
    print('Feature Type:', featuredatatype)
    cols = len(featuredata[0])  # number of columns of the 2-D list
    for j in range(cols):
        print(j, featuredatatype[j])
        if featuredatatype[j] == XL_CELL_TEXT:  # text columns are converted to integer codes
            thiscollist = []
            for i in range(len(featuredata)):
                # print(featuredata[i][j], end=', ')
                thiscollist.append(featuredata[i][j])
            print(thiscollist)
            colDict, colValues = list_to_value(thiscollist)
            print(colDict, colValues)
            for i in range(len(featuredata)):
                featuredata[i][j] = colValues[i]
    print('New Features:', featuredata)
    print('Labels: ', labellist, labeltype)
    labelDict, labelValues = list_to_value(labellist)
    print(labelDict)
    print(labelValues)
    data = np.array(featuredata)
    # test_idx = [55, 56, 57, 58, 59]
    test_idx = []
    # training data
    if len(test_idx) == 0:
        train_target = labelValues
        train_data = data
    else:
        train_target = np.delete(labelValues, test_idx)
        train_data = np.delete(data, test_idx, axis=0)
    # testing data
    if len(test_idx) == 0:
        test_target = labelValues
        test_data = data
    else:
        test_target = [labelValues[i] for i in test_idx]
        test_data = data[test_idx]
    # Core call of the decision-tree algorithm
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt.fit(train_data, train_target)
    # prediction results
    print(test_target)
    print(dt.predict(test_data))
    dot_data = StringIO()
    print(labelDict.keys())
    tree.export_graphviz(dt,
                         out_file=dot_data,
                         feature_names=header,                # header row
                         class_names=list(labelDict.keys()),  # label names
                         filled=True, rounded=True,
                         impurity=False)
    print(dot_data)
    # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    # Substitute a font that can render the Chinese feature and class names.
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace('helvetica', '"Microsoft YaHei"'))
    print(graph)
    outputfilename = filename + '-graph'
    graph.write_pdf(outputfilename + ".pdf")
    graph.write_jpg(outputfilename + ".jpg")
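For readers who only want the modelling step, here is a minimal, self-contained sketch (an illustration added here, not part of the original training sample) that runs the same decision tree on the apple/orange data as they look after encoding, i.e. the "New Features" and label codes shown in the run output below:

import numpy as np
from sklearn import tree

# Encoded features from the run output: [weight in grams, surface (0 = 光滑/smooth, 1 = 粗糙/rough)]
X = np.array([[140, 0], [130, 0], [150, 1], [170, 1], [150, 0], [130, 1]])
y = [1, 1, 0, 0, 1, 1]  # 0 = 苹果/apple, 1 = 橘子/orange, per the label dictionary below

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(X, y)
print(clf.predict([[160, 1]]))  # e.g. classify a 160 g, rough-skinned fruit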
Run results:
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')
['重量', '表皮光滑度']
define: 4 6
[['', ''], ['', ''], ['', ''], ['', ''], ['', ''], ['', '']]
[2, 1]
Features: [[140.0, '光滑'], [130.0, '光滑'], [150.0, '粗糙'], [170.0, '粗糙'], [150.0, '光滑'], [130.0, '粗糙']]
Feature Type: [2, 1]
0 2
1 1
['光滑', '光滑', '粗糙', '粗糙', '光滑', '粗糙']
{'光滑': 0, '粗糙': 1} [0, 0, 1, 1, 0, 1]
New Features: [[140.0, 0], [130.0, 0], [150.0, 1], [170.0, 1], [150.0, 0], [130.0, 1]]
Labels: ['橘子', '橘子', '苹果', '苹果', '橘子', '橘子'] 1
{'苹果': 0, '橘子': 1}
[1, 1, 0, 0, 1, 1]
[1, 1, 0, 0, 1, 1]
[1 1 0 0 1 1]
dict_keys(['苹果', '橘子'])
<_io.StringIO object at 0x000001D93BBA00D8>
<pydotplus.graphviz.Dot object at 0x000001D93BC40080>
At line 127, change the source data file as follows:
filename = "../dataset/houseloan-data-2.xlsx"
At line 156, enable the five-sample held-out test set (rows 55–59 are then removed from training and used only for prediction):
test_idx = [55, 56, 57, 58, 59]
#test_idx = []
Re-run:
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')
['年龄', '学历', '月收入', '婚姻状况', '是否有房产']
define: 7 60
[['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', '']]
[2, 1, 1, 1, 1]
Features: [[26.0, '本科', '低', '未婚', '否'], [24.0, '大专', '低', '已婚', '否'], [28.0, '硕士', '中', '未婚', '是'], [29.0, '本科', '低', '已婚', '是'], [29.0, '博士', '高', '已婚', '否'], [32.0, '本科', '中', '已婚', '是'], [26.0, '硕士', '中', '未婚', '否'], [33.0, '本科', '低', '未婚', '是'], [35.0, '研究生', '中', '已婚', '是'], [31.0, '大专', '中', '已婚', '否'], [30.0, '本科', '低', '未婚', '是'], [29.0, '研究生', '高', '已婚', '否'], [30.0, '大专', '中', '已婚', '否'], [34.0, '本科', '高', '未婚', '是'], [27.0, '本科', '高', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [23.0, '大专', '低', '已婚', '否'], [30.0, '大专', '低', '未婚', '是'], [23.0, '博士', '低', '未婚', '否'], [25.0, '硕士', '中', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [24.0, '本科', '中', '已婚', '否'], [32.0, '硕士', '中', '未婚', '否'], [34.0, '博士', '中', '已婚', '否'], [38.0, '大专', '中', '已婚', '是'], [35.0, '大专', '低', '未婚', '否'], [34.0, '本科', '低', '已婚', '否'], [32.0, '硕士', '低', '未婚', '是'], [30.0, '大专', '低', '未婚', '否'], [31.0, '本科', '低', '已婚', '是'], [40.0, '大专', '高', '未婚', '否'], [34.0, '大专', '中', '已婚', '否'], [45.0, '大专', '中', '已婚', '是'], [53.0, '大专', '高', '未婚', '是'], [34.0, '大专', '高', '已婚', '是'], [23.0, '本科', '高', '未婚', '否'], [35.0, '本科', '中', '已婚', '否'], [54.0, '大专', '高', '已婚', '是'], [65.0, '本科', '高', '未婚', '否'], [45.0, '大专', '中', '已婚', '是'], [34.0, '大专', '低', '未婚', '否'], [37.0, '本科', '中', '已婚', '否'], [42.0, '大专', '高', '已婚', '是'], [44.0, '本科', '中', '未婚', '是'], [28.0, '本科', '中', '已婚', '否'], [26.0, '本科', '中', '已婚', '是'], [25.0, '本科', '中', '已婚', '否'], [43.0, '博士', '高', '已婚', '否'], [48.0, '本科', '中', '未婚', '是'], [36.0, '硕士', '高', '未婚', '是'], [33.0, '大专', '中', '已婚', '是'], [30.0, '本科', '中', '已婚', '否'], [28.0, '研究生', '高', '未婚', '否'], [26.0, '研究生', '高', '已婚', '否'], [32.0, '硕士', '高', '已婚', '是'], [42.0, '本科', '中', '已婚', '否'], [47.0, '硕士', '高', '已婚', '否'], [45.0, '大专', '中', '未婚', '是'], [37.0, '本科', '中', '已婚', '是'], [35.0, '本科', '中', '已婚', '否']]
Feature Type: [2, 1, 1, 1, 1]
0 2
1 1
['本科', '大专', '硕士', '本科', '博士', '本科', '硕士', '本科', '研究生', '大专', '本科', '研究生', '大专', '本科', '本科', '本科', '大专', '大专', '博士', '硕士', '本科', '本科', '硕士', '博士', '大专', '大专', '本科', '硕士', '大专', '本科', '大专', '大专', '大专', '大专', '大专', '本科', '本科', '大专', '本科', '大专', '大专', '本科', '大专', '本科', '本科', '本科', '本科', '博士', '本科', '硕士', '大专', '本科', '研究生', '研究生', '硕士', '本科', '硕士', '大专', '本科', '本科']
{'研究生': 0, '大专': 1, '硕士': 2, '本科': 3, '博士': 4} [3, 1, 2, 3, 4, 3, 2, 3, 0, 1, 3, 0, 1, 3, 3, 3, 1, 1, 4, 2, 3, 3, 2, 4, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 4, 3, 2, 1, 3, 0, 0, 2, 3, 2, 1, 3, 3]
2 1
['低', '低', '中', '低', '高', '中', '中', '低', '中', '中', '低', '高', '中', '高', '高', '中', '低', '低', '低', '中', '中', '中', '中', '中', '中', '低', '低', '低', '低', '低', '高', '中', '中', '高', '高', '高', '中', '高', '高', '中', '低', '中', '高', '中', '中', '中', '中', '高', '中', '高', '中', '中', '高', '高', '高', '中', '高', '中', '中', '中']
{'低': 0, '高': 1, '中': 2} [0, 0, 2, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2]
3 1
['未婚', '已婚', '未婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '已婚', '已婚']
{'未婚': 0, '已婚': 1} [0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]
4 1
['否', '否', '是', '是', '否', '是', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '是', '否', '是', '是', '否', '否', '否', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '否', '是', '否', '否', '是', '是', '否']
{'否': 0, '是': 1} [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]
New Features: [[26.0, 3, 0, 0, 0], [24.0, 1, 0, 1, 0], [28.0, 2, 2, 0, 1], [29.0, 3, 0, 1, 1], [29.0, 4, 1, 1, 0], [32.0, 3, 2, 1, 1], [26.0, 2, 2, 0, 0], [33.0, 3, 0, 0, 1], [35.0, 0, 2, 1, 1], [31.0, 1, 2, 1, 0], [30.0, 3, 0, 0, 1], [29.0, 0, 1, 1, 0], [30.0, 1, 2, 1, 0], [34.0, 3, 1, 0, 1], [27.0, 3, 1, 0, 1], [26.0, 3, 2, 1, 1], [23.0, 1, 0, 1, 0], [30.0, 1, 0, 0, 1], [23.0, 4, 0, 0, 0], [25.0, 2, 2, 0, 1], [26.0, 3, 2, 1, 1], [24.0, 3, 2, 1, 0], [32.0, 2, 2, 0, 0], [34.0, 4, 2, 1, 0], [38.0, 1, 2, 1, 1], [35.0, 1, 0, 0, 0], [34.0, 3, 0, 1, 0], [32.0, 2, 0, 0, 1], [30.0, 1, 0, 0, 0], [31.0, 3, 0, 1, 1], [40.0, 1, 1, 0, 0], [34.0, 1, 2, 1, 0], [45.0, 1, 2, 1, 1], [53.0, 1, 1, 0, 1], [34.0, 1, 1, 1, 1], [23.0, 3, 1, 0, 0], [35.0, 3, 2, 1, 0], [54.0, 1, 1, 1, 1], [65.0, 3, 1, 0, 0], [45.0, 1, 2, 1, 1], [34.0, 1, 0, 0, 0], [37.0, 3, 2, 1, 0], [42.0, 1, 1, 1, 1], [44.0, 3, 2, 0, 1], [28.0, 3, 2, 1, 0], [26.0, 3, 2, 1, 1], [25.0, 3, 2, 1, 0], [43.0, 4, 1, 1, 0], [48.0, 3, 2, 0, 1], [36.0, 2, 1, 0, 1], [33.0, 1, 2, 1, 1], [30.0, 3, 2, 1, 0], [28.0, 0, 1, 0, 0], [26.0, 0, 1, 1, 0], [32.0, 2, 1, 1, 1], [42.0, 3, 2, 1, 0], [47.0, 2, 1, 1, 0], [45.0, 1, 2, 0, 1], [37.0, 3, 2, 1, 1], [35.0, 3, 2, 1, 0]]
Labels: ['否', '否', '是', '否', '是', '是', '否', '否', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否', '是', '是', '是', '否', '否', '是', '否', '否', '否', '否', '否', '是', '否', '否', '是', '是', '是', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否'] 1
{'否': 0, '是': 1}
[0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0]
[1, 1, 0, 1, 0]
[0 1 0 1 0]
dict_keys(['否', '是'])
<_io.StringIO object at 0x000001D93BB73168>
<pydotplus.graphviz.Dot object at 0x000001D93B6EA8D0>
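On this held-out set the tree gets 4 of the 5 predictions right (compare the two lists printed above). A quick way to confirm that figure, using only the values shown in the output (illustrative, not part of the original script):

from sklearn.metrics import accuracy_score

y_true = [1, 1, 0, 1, 0]  # test_target printed above
y_pred = [0, 1, 0, 1, 0]  # dt.predict(test_data) printed above
print(accuracy_score(y_true, y_pred))  # 0.8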