我想实现一个有连续值的决策树ID3算法,第一次输出的最优特征值没有问题,迭代的话就会报错,求帮忙
def load_data():
# Only include the first 8 descriptive features and the target label
data = pd.read_csv("heart.csv", usecols=[
"age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "target"]) #filter the features we are interested in
return data
def describe_partitions(ps):
for target, p in sorted(ps.items(), key=lambda k: k[0]):
print(f"{target}\t{p.shape[0]}")
print("")
# more frequently,higher entropy
def entropy(data):
counts = data["target"].value_counts()
# print(counts)
"""
Similar to doing the following manually:
counts = {}
for val in data["target"]:
counts[val] = counts.get(val, 0) + 1
"""
total = data["target"].shape[0]
# print(data)
sum = 0.
for count in counts:
p = count/total
sum += p * math.log(p)
return - sum
def partitions(data, feature, thresholds):
def find_threshold(feature, val):
# Guaranteed to find a threshold somewhere between min and max
for t in reversed(thresholds[feature]):
if val >= t:
return t
raise Exception("Unexpected return without threshold")
features = data.columns
ps = {}
for j, val in enumerate(data[feature]):
# Treat categorical and continuous feature values differently
if feature in thresholds:
val = find_threshold(feature, val)
p = ps.get(val, pd.DataFrame(columns=features))
ps[val] = p.append(data.loc[j, features])
return ps
def create_thresholds(data, names, nstds=3):
# Assume the data is normally-distributed
thresholds = {}
for feature in names:
col = data[feature]
mint, maxt = np.min(col), np.max(col)
mean, stddev = np.mean(col), np.std(col)
ts = [mint]
for n in range(-nstds - 1, nstds):
t = round(n * stddev + mean)
if t >= mint and t <= maxt:
ts.append(t)
thresholds[feature] = ts
return thresholds
def gain(data, H, feature, thresholds):
ps = partitions(data, feature, thresholds)
# describe_partitions(ps)
sum = 0.
for p in ps.values():
if feature in p.columns:
sum += (p.shape[0] / data.shape[0]) * entropy(p)
return H - sum
# return if the target_attr have only one value in data set
def unique(data):
value_count=data['target'].value_counts()
total=data['target'].shape[0]
if value_count[0]==total:
return data['target'].values[0],1
else:
return False,0
# return maximum occurence of a unique value of target_attr in data
def common_value(data):
c=data['target'].value_counts()
sc=sorted(c.iteritems(), key=operator.itemgetter(1), reverse=True)
return sc[0][0]
def best_attribute(data,attr,thresholds):
Gains=np.zeros(len(attr))
h=entropy(data)
for i,feature in enumerate(data.columns[attr]):
Gains[i]=gain(data,h,feature,thresholds)
best=data.columns[np.argmax(Gains)+1] #'+1' is because the first column is 'index',so the attribute features start at 1
return best,np.argmax(Gains)+1
def ID3(data,attr,thresholds):
tree={}
record,truth=unique(data)
# all examples have same value of attribute in the data
if truth!=0:
tree = record
# no more attributes to be considered
elif len(attr)==0:
# return tree with single node root labelled with most common value of target_attr in the data
tree=common_value(data)
else:
A,Anumber=best_attribute(data,attr,thresholds)
# print(A)
tree={A:{}}
values=data[A].values
attr.pop()
for vi in values:
#add subtree to tree
tree[A][vi]=ID3(data[data[A] == vi].drop(A, axis=1),attr,thresholds)
# print(examples)
return tree
def main():
data = load_data()
# Split into training and test data sets
train_data, test_data = train_test_split(data, test_size=0.25)
# Compute the total entropy for the full data set with respect to the target label
H = entropy(train_data)
print(f"Total Entropy: {H}")
# Generate threshold values for the continuous value descriptive features
thresholds = create_thresholds(
train_data, ["age", "chol", "trestbps", "thalach"], nstds=3)
# Compute the level=0 information gain when partitioned on each descriptive feature
IG = np.zeros(8)
for i, feature in enumerate(data.columns[:8]):
IG[i] = gain(train_data, H, feature, thresholds)
# Print the best one (at the level=0)
print(IG)
A=data.columns[np.argmax(IG)]
print("Best IG feature: "+A)
l=list(range(1,9))
# print(train_data)
DecisionTree=ID3(train_data,l,thresholds)
print(DecisionTree)