Copyright notice: this is an original article by the author; reposting is welcome. https://cloud.tencent.com/developer/article/1453997
Note: this program is a sample from an MIIT (Ministry of Industry and Information Technology) machine-learning training course. It reads a labeled dataset from an Excel workbook, maps text columns to integer codes, trains a scikit-learn decision tree, and renders the tree with pydotplus (which requires a local Graphviz installation).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import unicodedata
import copy
from xlrd import open_workbook
from xlrd import XL_CELL_TEXT, XL_CELL_NUMBER, XL_CELL_DATE, XL_CELL_BOOLEAN
import numpy as np
from sklearn import tree
# visualization of the resulting tree
from io import StringIO  # sklearn.externals.six has been removed from recent scikit-learn releases
import pydotplus
def is_number(num):
    """Return True if the string represents a number (including Unicode numerals)."""
    # anchored pattern for signed integers and decimals
    pattern = re.compile(r'^[-+]?(\d+(\.\d*)?|\.\d+)$')
    result = pattern.match(num)
    if result:
        return True
    else:
        try:  # check numerals written as Unicode characters (e.g. CJK digits)
            unicodedata.numeric(num)
            return True
        except (TypeError, ValueError):
            pass
    return False
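# Illustrative checks (not executed here):
#   is_number('3.14') -> True      is_number('-42') -> True
#   is_number('四')    -> True      (unicodedata.numeric knows CJK numerals)
#   is_number('abc')  -> False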
def sheet_to_array(filename, sheet_number, first_col=0, last_col=None, header=True):
    """Return a feature matrix and a label list from one sheet of an Excel spreadsheet.

    Notes:
    0. The data follow the Excel layout; every value in a column should have the same type.
    1. If first_col is 0 and last_col is None, all columns are used.
    2. If header is True, exactly one header row is assumed.
    3. All rows are loaded.
    4. The first column is a sequence number and the last column is the label;
       the columns in between are the features.
    """
    DEBUG = False
    # sheet
    book = open_workbook(filename)
    sheet0 = book.sheet_by_index(sheet_number)
    rows = sheet0.nrows  # number of rows
    # cols
    if not last_col:
        last_col = sheet0.ncols  # number of columns
    if first_col >= last_col:
        raise Exception("First column must be smaller than last column!")
    cols = [col for col in range(first_col, last_col)]
    # rows
    skip = 0
    header_names = []
    if header:  # handle the header row, if any
        skip = 1
        for item in sheet0.row(0):
            header_names.append(item.value)
        del header_names[0]   # drop the sequence-number column
        del header_names[-1]  # drop the label column
        print(header_names)
    # data = np.empty([len(cols), rows - skip])
    print('define: ', len(cols), rows - skip)
    # Instead of a numpy array, build a plain 2-D list so mixed data types can coexist.
    data = [['' for i in range(len(cols) - 2)] for j in range(rows - skip)]
    #
    print(data)
    datatype = ['' for i in range(len(cols) - 2)]  # records the data type of each feature column
    for col, cell in enumerate(sheet0.row(skip)[1:-1]):  # cell types taken from the first data row
        datatype[col] = cell.ctype  # store the numeric type code
        # if cell.ctype == XL_CELL_NUMBER:
        #     datatype[col] = 'number'
        # elif cell.ctype == XL_CELL_TEXT:
        #     datatype[col] = 'text'
        # elif cell.ctype == XL_CELL_DATE:
        #     datatype[col] = 'date'
        # elif cell.ctype == XL_CELL_BOOLEAN:
        #     datatype[col] = 'bool'
    print(datatype)
    checktypediff = False  # set if any column contains a type different from its first data row
    labellist = []
    for row in range(skip, sheet0.nrows):
        row_values = sheet0.row(row)
        # Strip the first column (sequence number) and the last column (label, kept separately).
        label = row_values[-1]
        features = row_values[1:-1]
        # print(row_values)
        # print(features, label)
        labellist.append(label.value)
        labeltype = label.ctype
        for col, cell in enumerate(features):
            if DEBUG and row < 2:
                print(row, col, cell.ctype, cell.value, ' -- ', row - skip, col - first_col, '\n')
            # if col in cols and cell.ctype == XL_CELL_NUMBER:
            if col in cols:
                if cell.ctype == XL_CELL_NUMBER:
                    data[row - skip][col - first_col] = cell.value
                elif cell.ctype == XL_CELL_TEXT:
                    if is_number(cell.value):
                        data[row - skip][col - first_col] = float(cell.value)
                    else:
                        data[row - skip][col - first_col] = cell.value.strip()
                else:
                    data[row - skip][col - first_col] = cell.value
                if datatype[col] != cell.ctype:
                    checktypediff = True
    # print(labellist)
    if checktypediff:
        print('Warning: a column contains inconsistent data types!')
    # arraydata = np.array(data)  # convert to a numpy array
    return data, datatype, labellist, labeltype, header_names
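# Illustrative return values for the apple/orange workbook used below (taken from the run output):
#   data      -> [[140.0, '光滑'], [130.0, '光滑'], ...]
#   datatype  -> [2, 1]            (xlrd codes: 2 = XL_CELL_NUMBER, 1 = XL_CELL_TEXT)
#   labellist -> ['橘子', '橘子', '苹果', ...], labeltype -> 1
#   header_names -> ['重量', '表皮光滑度']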
# Convert a list of user-defined strings into a list of integer codes,
# and also build the string-to-code lookup dictionary.
def list_to_value(srcList):
    setSrcList = set(srcList)      # convert to a set to remove duplicates
    sndSrcList = list(setSrcList)  # back to a list
    outputDict = dict(zip(sndSrcList, range(len(sndSrcList))))  # map each value to an integer code
    outputList = []
    for item in srcList:
        outputList.append(outputDict[item])
    return outputDict, outputList
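# Illustrative example (values match the first run below; set() ordering is not guaranteed,
# so the code assigned to each string can differ between runs):
#   list_to_value(['光滑', '光滑', '粗糙'])  ->  ({'光滑': 0, '粗糙': 1}, [0, 0, 1])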
if __name__ == '__main__':
    filename = "../dataset/appleororange-2.xlsx"
    # filename = "houseloan-data-2.xlsx"
    # filename = "playtennis-data-2.xlsx"
    SHEET = 0  # the sheet number being processed
    featuredata, featuredatatype, labellist, labeltype, header = sheet_to_array(filename, SHEET, header=True)
    # Only a deep copy produces a new, independent nested list.
    rawfeaturedata = copy.deepcopy(featuredata)
    print('Features:', featuredata)
    print('Feature Type:', featuredatatype)
    cols = len(featuredata[0])  # number of columns of the 2-D list
    for j in range(cols):
        print(j, featuredatatype[j])
        if featuredatatype[j] == XL_CELL_TEXT:  # text columns are converted to integer codes
            thiscollist = []
            for i in range(len(featuredata)):
                # print(featuredata[i][j], end=', ')
                thiscollist.append(featuredata[i][j])
            print(thiscollist)
            colDict, colValues = list_to_value(thiscollist)
            print(colDict, colValues)
            for i in range(len(featuredata)):
                featuredata[i][j] = colValues[i]
    print('New Features:', featuredata)
    print('Labels: ', labellist, labeltype)
    labelDict, labelValues = list_to_value(labellist)
    print(labelDict)
    print(labelValues)
    data = np.array(featuredata)
    # test_idx = [55, 56, 57, 58, 59]
    test_idx = []
    # training data
    if len(test_idx) == 0:
        train_target = labelValues
        train_data = data
    else:
        train_target = np.delete(labelValues, test_idx)
        train_data = np.delete(data, test_idx, axis=0)
    # testing data
    if len(test_idx) == 0:
        test_target = labelValues
        test_data = data
    else:
        test_target = [labelValues[i] for i in test_idx]
        test_data = data[test_idx]
    # Core call of the decision-tree algorithm
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt.fit(train_data, train_target)
    # prediction results
    print(test_target)
    print(dt.predict(test_data))
    dot_data = StringIO()
    print(labelDict.keys())
    tree.export_graphviz(dt,
                         out_file=dot_data,
                         feature_names=header,                # header row
                         class_names=list(labelDict.keys()),  # label names
                         filled=True, rounded=True,
                         impurity=False)
    print(dot_data)
    # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    # Substitute a font that can render the Chinese feature and class names.
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue().replace('helvetica', '"Microsoft YaHei"'))
    print(graph)
    outputfilename = filename + '-graph'
    graph.write_pdf(outputfilename + ".pdf")
    graph.write_jpg(outputfilename + ".jpg")
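For readers who only want the modelling step, here is a minimal, self-contained sketch (an illustration added here, not part of the original training sample) that runs the same decision tree on the apple/orange data as they look after encoding, i.e. the "New Features" and label codes shown in the run output below:

import numpy as np
from sklearn import tree

# Encoded features from the run output: [weight in grams, surface (0 = 光滑/smooth, 1 = 粗糙/rough)]
X = np.array([[140, 0], [130, 0], [150, 1], [170, 1], [150, 0], [130, 1]])
y = [1, 1, 0, 0, 1, 1]  # 0 = 苹果/apple, 1 = 橘子/orange, per the label dictionary below

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(X, y)
print(clf.predict([[160, 1]]))  # e.g. classify a 160 g, rough-skinned fruit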
Run results:
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')
['重量', '表皮光滑度']
define: 4 6
[['', ''], ['', ''], ['', ''], ['', ''], ['', ''], ['', '']]
[2, 1]
Features: [[140.0, '光滑'], [130.0, '光滑'], [150.0, '粗糙'], [170.0, '粗糙'], [150.0, '光滑'], [130.0, '粗糙']]
Feature Type: [2, 1]
0 2
1 1
['光滑', '光滑', '粗糙', '粗糙', '光滑', '粗糙']
{'光滑': 0, '粗糙': 1} [0, 0, 1, 1, 0, 1]
New Features: [[140.0, 0], [130.0, 0], [150.0, 1], [170.0, 1], [150.0, 0], [130.0, 1]]
Labels: ['橘子', '橘子', '苹果', '苹果', '橘子', '橘子'] 1
{'苹果': 0, '橘子': 1}
[1, 1, 0, 0, 1, 1]
[1, 1, 0, 0, 1, 1]
[1 1 0 0 1 1]
dict_keys(['苹果', '橘子'])
<_io.StringIO object at 0x000001D93BBA00D8>
<pydotplus.graphviz.Dot object at 0x000001D93BC40080>
At line 127, change the source data file as follows:
filename = "../dataset/houseloan-data-2.xlsx"
At line 156, enable the five-sample held-out test set (rows 55–59 are then removed from training and used only for prediction):
test_idx = [55, 56, 57, 58, 59]
#test_idx = []
Re-run:
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')
['年龄', '学历', '月收入', '婚姻状况', '是否有房产']
define: 7 60
[['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', '']]
[2, 1, 1, 1, 1]
Features: [[26.0, '本科', '低', '未婚', '否'], [24.0, '大专', '低', '已婚', '否'], [28.0, '硕士', '中', '未婚', '是'], [29.0, '本科', '低', '已婚', '是'], [29.0, '博士', '高', '已婚', '否'], [32.0, '本科', '中', '已婚', '是'], [26.0, '硕士', '中', '未婚', '否'], [33.0, '本科', '低', '未婚', '是'], [35.0, '研究生', '中', '已婚', '是'], [31.0, '大专', '中', '已婚', '否'], [30.0, '本科', '低', '未婚', '是'], [29.0, '研究生', '高', '已婚', '否'], [30.0, '大专', '中', '已婚', '否'], [34.0, '本科', '高', '未婚', '是'], [27.0, '本科', '高', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [23.0, '大专', '低', '已婚', '否'], [30.0, '大专', '低', '未婚', '是'], [23.0, '博士', '低', '未婚', '否'], [25.0, '硕士', '中', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [24.0, '本科', '中', '已婚', '否'], [32.0, '硕士', '中', '未婚', '否'], [34.0, '博士', '中', '已婚', '否'], [38.0, '大专', '中', '已婚', '是'], [35.0, '大专', '低', '未婚', '否'], [34.0, '本科', '低', '已婚', '否'], [32.0, '硕士', '低', '未婚', '是'], [30.0, '大专', '低', '未婚', '否'], [31.0, '本科', '低', '已婚', '是'], [40.0, '大专', '高', '未婚', '否'], [34.0, '大专', '中', '已婚', '否'], [45.0, '大专', '中', '已婚', '是'], [53.0, '大专', '高', '未婚', '是'], [34.0, '大专', '高', '已婚', '是'], [23.0, '本科', '高', '未婚', '否'], [35.0, '本科', '中', '已婚', '否'], [54.0, '大专', '高', '已婚', '是'], [65.0, '本科', '高', '未婚', '否'], [45.0, '大专', '中', '已婚', '是'], [34.0, '大专', '低', '未婚', '否'], [37.0, '本科', '中', '已婚', '否'], [42.0, '大专', '高', '已婚', '是'], [44.0, '本科', '中', '未婚', '是'], [28.0, '本科', '中', '已婚', '否'], [26.0, '本科', '中', '已婚', '是'], [25.0, '本科', '中', '已婚', '否'], [43.0, '博士', '高', '已婚', '否'], [48.0, '本科', '中', '未婚', '是'], [36.0, '硕士', '高', '未婚', '是'], [33.0, '大专', '中', '已婚', '是'], [30.0, '本科', '中', '已婚', '否'], [28.0, '研究生', '高', '未婚', '否'], [26.0, '研究生', '高', '已婚', '否'], [32.0, '硕士', '高', '已婚', '是'], [42.0, '本科', '中', '已婚', '否'], [47.0, '硕士', '高', '已婚', '否'], [45.0, '大专', '中', '未婚', '是'], [37.0, '本科', '中', '已婚', '是'], [35.0, '本科', '中', '已婚', '否']]
Feature Type: [2, 1, 1, 1, 1]
0 2
1 1
['本科', '大专', '硕士', '本科', '博士', '本科', '硕士', '本科', '研究生', '大专', '本科', '研究生', '大专', '本科', '本科', '本科', '大专', '大专', '博士', '硕士', '本科', '本科', '硕士', '博士', '大专', '大专', '本科', '硕士', '大专', '本科', '大专', '大专', '大专', '大专', '大专', '本科', '本科', '大专', '本科', '大专', '大专', '本科', '大专', '本科', '本科', '本科', '本科', '博士', '本科', '硕士', '大专', '本科', '研究生', '研究生', '硕士', '本科', '硕士', '大专', '本科', '本科']
{'研究生': 0, '大专': 1, '硕士': 2, '本科': 3, '博士': 4} [3, 1, 2, 3, 4, 3, 2, 3, 0, 1, 3, 0, 1, 3, 3, 3, 1, 1, 4, 2, 3, 3, 2, 4, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 4, 3, 2, 1, 3, 0, 0, 2, 3, 2, 1, 3, 3]
2 1
['低', '低', '中', '低', '高', '中', '中', '低', '中', '中', '低', '高', '中', '高', '高', '中', '低', '低', '低', '中', '中', '中', '中', '中', '中', '低', '低', '低', '低', '低', '高', '中', '中', '高', '高', '高', '中', '高', '高', '中', '低', '中', '高', '中', '中', '中', '中', '高', '中', '高', '中', '中', '高', '高', '高', '中', '高', '中', '中', '中']
{'低': 0, '高': 1, '中': 2} [0, 0, 2, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2]
3 1
['未婚', '已婚', '未婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '已婚', '已婚']
{'未婚': 0, '已婚': 1} [0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]
4 1
['否', '否', '是', '是', '否', '是', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '是', '否', '是', '是', '否', '否', '否', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '否', '是', '否', '否', '是', '是', '否']
{'否': 0, '是': 1} [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]
New Features: [[26.0, 3, 0, 0, 0], [24.0, 1, 0, 1, 0], [28.0, 2, 2, 0, 1], [29.0, 3, 0, 1, 1], [29.0, 4, 1, 1, 0], [32.0, 3, 2, 1, 1], [26.0, 2, 2, 0, 0], [33.0, 3, 0, 0, 1], [35.0, 0, 2, 1, 1], [31.0, 1, 2, 1, 0], [30.0, 3, 0, 0, 1], [29.0, 0, 1, 1, 0], [30.0, 1, 2, 1, 0], [34.0, 3, 1, 0, 1], [27.0, 3, 1, 0, 1], [26.0, 3, 2, 1, 1], [23.0, 1, 0, 1, 0], [30.0, 1, 0, 0, 1], [23.0, 4, 0, 0, 0], [25.0, 2, 2, 0, 1], [26.0, 3, 2, 1, 1], [24.0, 3, 2, 1, 0], [32.0, 2, 2, 0, 0], [34.0, 4, 2, 1, 0], [38.0, 1, 2, 1, 1], [35.0, 1, 0, 0, 0], [34.0, 3, 0, 1, 0], [32.0, 2, 0, 0, 1], [30.0, 1, 0, 0, 0], [31.0, 3, 0, 1, 1], [40.0, 1, 1, 0, 0], [34.0, 1, 2, 1, 0], [45.0, 1, 2, 1, 1], [53.0, 1, 1, 0, 1], [34.0, 1, 1, 1, 1], [23.0, 3, 1, 0, 0], [35.0, 3, 2, 1, 0], [54.0, 1, 1, 1, 1], [65.0, 3, 1, 0, 0], [45.0, 1, 2, 1, 1], [34.0, 1, 0, 0, 0], [37.0, 3, 2, 1, 0], [42.0, 1, 1, 1, 1], [44.0, 3, 2, 0, 1], [28.0, 3, 2, 1, 0], [26.0, 3, 2, 1, 1], [25.0, 3, 2, 1, 0], [43.0, 4, 1, 1, 0], [48.0, 3, 2, 0, 1], [36.0, 2, 1, 0, 1], [33.0, 1, 2, 1, 1], [30.0, 3, 2, 1, 0], [28.0, 0, 1, 0, 0], [26.0, 0, 1, 1, 0], [32.0, 2, 1, 1, 1], [42.0, 3, 2, 1, 0], [47.0, 2, 1, 1, 0], [45.0, 1, 2, 0, 1], [37.0, 3, 2, 1, 1], [35.0, 3, 2, 1, 0]]
Labels: ['否', '否', '是', '否', '是', '是', '否', '否', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否', '是', '是', '是', '否', '否', '是', '否', '否', '否', '否', '否', '是', '否', '否', '是', '是', '是', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否'] 1
{'否': 0, '是': 1}
[0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0]
[1, 1, 0, 1, 0]
[0 1 0 1 0]
dict_keys(['否', '是'])
<_io.StringIO object at 0x000001D93BB73168>
<pydotplus.graphviz.Dot object at 0x000001D93B6EA8D0>
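On this held-out set the tree gets 4 of the 5 predictions right (compare the two lists printed above). A quick way to confirm that figure, using only the values shown in the output (illustrative, not part of the original script):

from sklearn.metrics import accuracy_score

y_true = [1, 1, 0, 1, 0]  # test_target printed above
y_pred = [0, 1, 0, 1, 0]  # dt.predict(test_data) printed above
print(accuracy_score(y_true, y_pred))  # 0.8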