2012年-2022年的黄金成交价格
数据说明 Date-日期 Close/Last-收盘价 Volume-成交量 Open-开盘价 High-最高价 Low-最低价
Date 0 Close/Last 0 Volume 39 Open 0 High 0 Low 0 dtype: int64
In [1]:
!pip install pyecharts -i https://pypi.mirrors.ustc.edu.cn/simple
In [2]:
import pandas as pd
from pyecharts.charts import *
from pyecharts.charts import Bar
from pyecharts.charts import Map
from pyecharts.charts import Pie
from pyecharts.charts import Line
from pyecharts.charts import Timeline
from pyecharts import options as opts
import warnings
warnings.filterwarnings('ignore')
In [3]:
#读取数据
df = pd.read_csv('/home/mw/input/data2152/gold_data.csv')
df.head(10)
Out[3]:
Date | Close/Last | Volume | Open | High | Low | |
---|---|---|---|---|---|---|
0 | 10/28/2022 | 1648.3 | 186519.0 | 1667.2 | 1670.9 | 1640.7 |
1 | 10/27/2022 | 1668.8 | 180599.0 | 1668.8 | 1674.8 | 1658.5 |
2 | 10/26/2022 | 1669.2 | 183453.0 | 1657.7 | 1679.4 | 1653.8 |
3 | 10/25/2022 | 1658.0 | 178706.0 | 1654.5 | 1666.8 | 1641.2 |
4 | 10/24/2022 | 1654.1 | 167448.0 | 1662.9 | 1675.5 | 1648.0 |
5 | 10/21/2022 | 1656.3 | 265985.0 | 1632.4 | 1663.1 | 1621.1 |
6 | 10/20/2022 | 1636.8 | 159797.0 | 1634.6 | 1650.3 | 1626.3 |
7 | 10/19/2022 | 1634.2 | 172551.0 | 1657.2 | 1659.8 | 1632.2 |
8 | 10/18/2022 | 1655.5 | NaN | 1655.5 | 1655.5 | 1655.5 |
9 | 10/17/2022 | 1664.0 | 144374.0 | 1649.9 | 1674.3 | 1649.1 |
In [4]:
df.tail(10)
Out[4]:
Date | Close/Last | Volume | Open | High | Low | |
---|---|---|---|---|---|---|
2537 | 11/13/2012 | 1724.8 | 132265.0 | 1729.0 | 1733.3 | 1717.6 |
2538 | 11/12/2012 | 1730.9 | 100825.0 | 1731.8 | 1738.0 | 1725.2 |
2539 | 11/09/2012 | 1730.9 | 137819.0 | 1731.5 | 1739.4 | 1726.9 |
2540 | 11/08/2012 | 1726.0 | 147342.0 | 1714.8 | 1735.1 | 1712.6 |
2541 | 11/07/2012 | 1714.0 | 241179.0 | 1715.8 | 1733.0 | 1703.0 |
2542 | 11/06/2012 | 1715.0 | 163585.0 | 1685.4 | 1720.9 | 1683.5 |
2543 | 11/05/2012 | 1683.2 | 109647.0 | 1676.7 | 1686.2 | 1672.5 |
2544 | 11/02/2012 | 1675.2 | 205777.0 | 1715.6 | 1717.2 | 1674.8 |
2545 | 11/01/2012 | 1715.5 | 105904.0 | 1720.4 | 1727.5 | 1715.1 |
2546 | 10/31/2012 | 1719.1 | 110928.0 | 1710.3 | 1726.6 | 1709.8 |
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2547 non-null object
1 Close/Last 2547 non-null float64
2 Volume 2508 non-null float64
3 Open 2547 non-null float64
4 High 2547 non-null float64
5 Low 2547 non-null float64
dtypes: float64(5), object(1)
memory usage: 119.5+ KB
In [6]:
#更改时间日期格式
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%Y")
df.head()
Out[6]:
Date | Close/Last | Volume | Open | High | Low | |
---|---|---|---|---|---|---|
0 | 2022-10-28 | 1648.3 | 186519.0 | 1667.2 | 1670.9 | 1640.7 |
1 | 2022-10-27 | 1668.8 | 180599.0 | 1668.8 | 1674.8 | 1658.5 |
2 | 2022-10-26 | 1669.2 | 183453.0 | 1657.7 | 1679.4 | 1653.8 |
3 | 2022-10-25 | 1658.0 | 178706.0 | 1654.5 | 1666.8 | 1641.2 |
4 | 2022-10-24 | 1654.1 | 167448.0 | 1662.9 | 1675.5 | 1648.0 |
In [7]:
#返回每一列缺失值统计
print(df.isnull().sum())
df['Volume'].fillna(method='bfill', inplace=True) # 使用前一个有效值填补
print(df.isnull().sum())
Date 0
Close/Last 0
Volume 39
Open 0
High 0
Low 0
dtype: int64
Date 0
Close/Last 0
Volume 0
Open 0
High 0
Low 0
dtype: int64
In [8]:
df.duplicated().sum()
0
In [9]:
#k线图
# 提取需要用于绘图的数据
df['Date'] = pd.to_datetime(df['Date'])
date = df['Date']
open_price = df['Open']
close_price = df['Close/Last']
high_price = df['High']
low_price = df['Low']
# 设置日期范围
start_date = '2012-10-28'
end_date = '2022-10-28'
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
df_filtered = df_filtered.sort_values('Date', ascending=True) # 按日期升序排列
kline = (
Kline()
.add_xaxis(df_filtered['Date'].tolist())
.add_yaxis('K线图', [[o, c, l, h] for o, c, l, h in zip(df_filtered['Open'], df_filtered['Close/Last'], df_filtered['Low'], df_filtered['High'])])
.set_global_opts(
title_opts=opts.TitleOpts(title="黄金价格k线图"),
xaxis_opts=opts.AxisOpts(type_="category"),
yaxis_opts=opts.AxisOpts(name="价格"),
)
)
kline.render_notebook()
Out[9]:
In [10]:
# 创建Bar对象
bar = (
Bar()
.add_xaxis(df_filtered['Date'].tolist())
.add_yaxis("成交量", df_filtered['Volume'].tolist())
.set_global_opts(
xaxis_opts=opts.AxisOpts(
type_="category",
axislabel_opts=opts.LabelOpts(rotate=-45),
is_scale=True, # 开启拖拽缩放
boundary_gap=True, # 设置坐标轴两端留白
),
yaxis_opts=opts.AxisOpts(name="成交量"),
title_opts=opts.TitleOpts(title="黄金价格成交量柱状图"),
toolbox_opts=opts.ToolboxOpts(is_show=True), # 显示工具栏
datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")], # 添加数据缩放组件
)
)
bar.render_notebook()
Out[10]:
In [11]:
# 计算移动平均线
ma5 = df_filtered['Close/Last'].rolling(window=5).mean()
ma20 = df_filtered['Close/Last'].rolling(window=20).mean()
# 将移动平均线数据添加到DataFrame
df_filtered['MA5'] = ma5
df_filtered['MA20'] = ma20
# 创建Line对象
line = (
Line()
.add_xaxis(df_filtered['Date'].tolist())
.add_yaxis("收盘价", df_filtered['Close/Last'].tolist())
.add_yaxis("5日移动平均线", df_filtered['MA5'].tolist())
.add_yaxis("20日移动平均线", df_filtered['MA20'].tolist())
.set_global_opts(
xaxis_opts=opts.AxisOpts(
type_="category",
axislabel_opts=opts.LabelOpts(rotate=-45),
is_scale=True, # 开启拖拽缩放
boundary_gap=True, # 设置坐标轴两端留白
),
yaxis_opts=opts.AxisOpts(name="价格"),
title_opts=opts.TitleOpts(title="黄金价格移动平均线图"),
toolbox_opts=opts.ToolboxOpts(is_show=True), # 显示工具栏
datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")], # 添加数据缩放组件
)
)
line.render_notebook()
Out[11]:
In [12]:
# 筛选出2012-2018年的数据作为训练集
start_date = '2012-10-28'
end_date = '2019-12-31'
train_df = df_filtered[(df_filtered['Date'] >= start_date) & (df_filtered['Date'] <= end_date)]
# 剩下的数据作为测试集
test_df = df_filtered[df_filtered['Date'] > end_date]
train_df.shape,test_df.shape
Out[12]:
((1834, 8), (713, 8))
In [13]:
df_filtered
Out[13]:
Date | Close/Last | Volume | Open | High | Low | MA5 | MA20 | |
---|---|---|---|---|---|---|---|---|
2546 | 2012-10-31 | 1719.1 | 110928.0 | 1710.3 | 1726.6 | 1709.8 | NaN | NaN |
2545 | 2012-11-01 | 1715.5 | 105904.0 | 1720.4 | 1727.5 | 1715.1 | NaN | NaN |
2544 | 2012-11-02 | 1675.2 | 205777.0 | 1715.6 | 1717.2 | 1674.8 | NaN | NaN |
2543 | 2012-11-05 | 1683.2 | 109647.0 | 1676.7 | 1686.2 | 1672.5 | NaN | NaN |
2542 | 2012-11-06 | 1715.0 | 163585.0 | 1685.4 | 1720.9 | 1683.5 | 1701.60 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4 | 2022-10-24 | 1654.1 | 167448.0 | 1662.9 | 1675.5 | 1648.0 | 1647.38 | 1675.785 |
3 | 2022-10-25 | 1658.0 | 178706.0 | 1654.5 | 1666.8 | 1641.2 | 1647.88 | 1676.875 |
2 | 2022-10-26 | 1669.2 | 183453.0 | 1657.7 | 1679.4 | 1653.8 | 1654.88 | 1676.835 |
1 | 2022-10-27 | 1668.8 | 180599.0 | 1668.8 | 1674.8 | 1658.5 | 1661.28 | 1676.845 |
0 | 2022-10-28 | 1648.3 | 186519.0 | 1667.2 | 1670.9 | 1640.7 | 1659.68 | 1675.660 |
2547 rows × 8 columns
In [14]:
from pycaret.regression import *
#创建回归模型
reg1 = setup(data = train_df,
session_id=23,
target = 'Close/Last', #指定目标列
categorical_features=["MA5","MA20"], #显式指定类别量的行
feature_selection=True,feature_selection_threshold=0.8, #去除次要特征
normalize = True
)
Setup Succesfully Completed.
Description | Value | |
---|---|---|
0 | session_id | 23 |
1 | Transform Target | False |
2 | Transform Target Method | None |
3 | Original Data | (1834, 8) |
4 | Missing Values | True |
5 | Numeric Features | 4 |
6 | Categorical Features | 2 |
7 | Ordinal Features | False |
8 | High Cardinality Features | False |
9 | High Cardinality Method | None |
10 | Sampled Data | (1834, 8) |
11 | Transformed Train Set | (1283, 3579) |
12 | Transformed Test Set | (551, 3579) |
13 | Numeric Imputer | mean |
14 | Categorical Imputer | constant |
15 | Normalize | True |
16 | Normalize Method | zscore |
17 | Transformation | False |
18 | Transformation Method | None |
19 | PCA | False |
20 | PCA Method | None |
21 | PCA Components | None |
22 | Ignore Low Variance | False |
23 | Combine Rare Levels | False |
24 | Rare Level Threshold | None |
25 | Numeric Binning | False |
26 | Remove Outliers | False |
27 | Outliers Threshold | None |
28 | Remove Multicollinearity | False |
29 | Multicollinearity Threshold | None |
30 | Clustering | False |
31 | Clustering Iteration | None |
32 | Polynomial Features | False |
33 | Polynomial Degree | None |
34 | Trignometry Features | False |
35 | Polynomial Threshold | None |
36 | Group Features | False |
37 | Feature Selection | True |
38 | Features Selection Threshold | 0.800000 |
39 | Feature Interaction | False |
40 | Feature Ratio | False |
41 | Interaction Threshold | None |
In [15]:
#预选模型分别为决策树和三个梯度提升决策树模型
compare_models(sort='RMSE',whitelist=['dt','xgboost','lightgbm','catboost'])
Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) | |
---|---|---|---|---|---|---|---|---|
0 | Light Gradient Boosting Machine | 4.6107 | 41.9748 | 6.4126 | 0.9976 | 0.0049 | 0.0035 | 0.1476 |
1 | CatBoost Regressor | 5.3928 | 54.5599 | 7.3295 | 0.9968 | 0.0056 | 0.0041 | 21.6438 |
2 | Extreme Gradient Boosting | 5.4444 | 59.4263 | 7.6545 | 0.9965 | 0.0058 | 0.0042 | 12.8132 |
3 | Decision Tree | 5.7041 | 71.2734 | 8.3253 | 0.9959 | 0.0064 | 0.0044 | 0.1292 |
Out[15]:
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
importance_type='split', learning_rate=0.1, max_depth=-1,
min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
random_state=23, reg_alpha=0.0, reg_lambda=0.0, silent=True,
subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
In [20]:
#dt评分是最高,因此选择创建一个dt模型
LGBM = create_model("lightgbm")
MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
---|---|---|---|---|---|---|
0 | 3.8260 | 27.3029 | 5.2252 | 0.9985 | 0.0040 | 0.0029 |
1 | 4.6190 | 38.5253 | 6.2069 | 0.9971 | 0.0048 | 0.0036 |
2 | 4.8288 | 51.1772 | 7.1538 | 0.9978 | 0.0053 | 0.0036 |
3 | 4.6855 | 44.5358 | 6.6735 | 0.9968 | 0.0051 | 0.0036 |
4 | 5.3560 | 67.1921 | 8.1971 | 0.9965 | 0.0063 | 0.0041 |
5 | 4.0250 | 27.0699 | 5.2029 | 0.9984 | 0.0040 | 0.0031 |
6 | 4.7729 | 45.2571 | 6.7273 | 0.9968 | 0.0051 | 0.0037 |
7 | 4.8022 | 41.2346 | 6.4214 | 0.9977 | 0.0051 | 0.0038 |
8 | 4.1472 | 27.7615 | 5.2689 | 0.9985 | 0.0040 | 0.0032 |
9 | 5.0448 | 49.6920 | 7.0493 | 0.9975 | 0.0052 | 0.0038 |
Mean | 4.6107 | 41.9748 | 6.4126 | 0.9976 | 0.0049 | 0.0035 |
SD | 0.4510 | 12.0390 | 0.9236 | 0.0007 | 0.0007 | 0.0003 |
In [21]:
# 通过tune_model处理模型对象,对其超参数进行调整
tuned_LGBM = tune_model(LGBM)
MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
---|---|---|---|---|---|---|
0 | 3.8391 | 28.4661 | 5.3354 | 0.9984 | 0.0041 | 0.0030 |
1 | 4.8190 | 40.2126 | 6.3413 | 0.9970 | 0.0049 | 0.0037 |
2 | 4.7544 | 52.0082 | 7.2117 | 0.9978 | 0.0053 | 0.0036 |
3 | 5.0559 | 50.0589 | 7.0752 | 0.9964 | 0.0055 | 0.0039 |
4 | 5.2125 | 70.2318 | 8.3804 | 0.9964 | 0.0065 | 0.0040 |
5 | 4.1852 | 31.0530 | 5.5725 | 0.9982 | 0.0043 | 0.0032 |
6 | 4.8954 | 48.2966 | 6.9496 | 0.9966 | 0.0053 | 0.0038 |
7 | 4.9024 | 44.2247 | 6.6502 | 0.9975 | 0.0053 | 0.0038 |
8 | 4.2562 | 29.3606 | 5.4185 | 0.9984 | 0.0041 | 0.0033 |
9 | 5.1730 | 50.9983 | 7.1413 | 0.9974 | 0.0053 | 0.0039 |
Mean | 4.7093 | 44.4911 | 6.6076 | 0.9974 | 0.0051 | 0.0036 |
SD | 0.4372 | 12.2131 | 0.9113 | 0.0008 | 0.0007 | 0.0003 |
In [22]:
# 在之前划分的数据集上进行预测
predictions = predict_model(tuned_LGBM, data = test_df)predictions
Out[22]:
Date | Close/Last | Volume | Open | High | Low | MA5 | MA20 | Label | |
---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-02 | 1528.1 | 257867.0 | 1521.0 | 1534.0 | 1519.7 | 1519.74 | 1487.600 | 1537.2635 |
1 | 2020-01-03 | 1549.2 | 107.0 | 1530.1 | 1552.7 | 1530.1 | 1526.70 | 1491.050 | 1545.2156 |
2 | 2020-01-06 | 1568.8 | 539023.0 | 1562.7 | 1590.9 | 1562.3 | 1536.84 | 1495.335 | 1571.5318 |
3 | 2020-01-07 | 1574.3 | 418145.0 | 1567.4 | 1579.2 | 1557.0 | 1547.98 | 1501.095 | 1565.3638 |
4 | 2020-01-08 | 1560.2 | 787217.0 | 1556.7 | 1563.8 | 1556.5 | 1556.12 | 1505.860 | 1564.4905 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
708 | 2022-10-24 | 1654.1 | 167448.0 | 1662.9 | 1675.5 | 1648.0 | 1647.38 | 1675.785 | 1664.3066 |
709 | 2022-10-25 | 1658.0 | 178706.0 | 1654.5 | 1666.8 | 1641.2 | 1647.88 | 1676.875 | 1645.5911 |
710 | 2022-10-26 | 1669.2 | 183453.0 | 1657.7 | 1679.4 | 1653.8 | 1654.88 | 1676.835 | 1671.2611 |
711 | 2022-10-27 | 1668.8 | 180599.0 | 1668.8 | 1674.8 | 1658.5 | 1661.28 | 1676.845 | 1663.3458 |
712 | 2022-10-28 | 1648.3 | 186519.0 | 1667.2 | 1670.9 | 1640.7 | 1659.68 | 1675.660 | 1648.0580 |
713 rows × 9 columns
In [24]:
# 提取"Date"、"Close/Last"和"Label"列的数据
dates = predictions['Date'].tolist()
close_last = predictions['Close/Last'].tolist()
label = predictions['Label'].tolist()
# 创建折线图对象
line_chart = Line()
# 添加"Close/Last"数据线
line_chart.add_xaxis(dates)
line_chart.add_yaxis("Close/Last", close_last)
# 添加"Label"数据线
line_chart.add_yaxis("Label", label)
# 设置全局配置
line_chart.set_global_opts(
title_opts=opts.TitleOpts(title="Close/Last and Label Line Chart"),
xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
yaxis_opts=opts.AxisOpts(name='Value'),
)
# 渲染图表并保存为html文件
line_chart.render_notebook()
import numpy as np
label = np.array(predictions["Label"])
target = np.array(test_df["Close/Last"])
error = label - target
# 计算均方根误差RMSE
RMSE = np.nanmean((error**2))**0.5
# 计算平均绝对误差MAE
MAE = np.nanmean(abs(error))
print('RMSE:',RMSE)
print('MAE:',MAE)
RMSE: 122.60895426490625
MAE: 95.08966605890605
1.使用MA5和MA20作为参数的预测效果一般,仅在test时间的开头和结尾较为拟合
2.RMSE和MAE较大,采用其他参数或者使用其他算法可能获得更好的结果
3.本次compare model仅在决策树算法中挑选,可能决策树算法不太适合价格预测。