1. 数据集
数据集包含2014年至2020年的数据,预测目标为其中的close列。
2. Python代码
2.1 导入必要的包
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import mean_squared_error
import numpy as np
import math
2.2 中文字体
from matplotlib.font_manager import FontProperties
# Font object shared by the plotting helper for the title and axis labels,
# so that Chinese text can be rendered.
kaiti = FontProperties("KaiTi", size=16) # KaiTi typeface, 16 pt
2.3 数据预处理
def difference(data_set, interval=1):
    """Return the lag-``interval`` differences of a one-dimensional sequence.

    Element ``k`` of the result is the value at position ``k + interval``
    minus the value at position ``k``. Lookups are positional, so a
    date-indexed pandas Series is handled the same way as a list.

    Args:
        data_set: List, NumPy array or pandas Series of numeric values.
        interval: Non-negative lag between the two subtracted elements.

    Returns:
        A ``pd.Series`` with ``max(len(data_set) - interval, 0)`` entries and
        a fresh default integer index.

    Raises:
        ValueError: If ``interval`` is negative.
    """
    if interval < 0:
        raise ValueError("interval must be non-negative")
    # Work on the raw values: an integer ``[]`` lookup on a Series is a label
    # lookup, which is not what we want for a series indexed by dates.
    values = np.asarray(data_set)
    # Clamp so that an input shorter than ``interval`` yields an empty result
    # instead of a shape mismatch.
    lagged = values[:max(len(values) - interval, 0)]
    return pd.Series(values[interval:] - lagged)
def invert_difference(history, yhat, interval=1):
    """Undo one differencing step for a single predicted value.

    Args:
        history: The original (undifferenced) observations as a list, NumPy
            array or pandas Series.
        yhat: The predicted difference.
        interval: How many positions back from the end of ``history`` the
            base observation sits.

    Returns:
        ``yhat`` plus the observation at position ``-interval`` of
        ``history``.
    """
    # Use positional access for pandas objects: plain ``[]`` with an integer
    # is a label lookup on a Series, which is wrong for a date index.
    if hasattr(history, "iloc"):
        base = history.iloc[-interval]
    else:
        base = history[-interval]
    return yhat + base
def timeseries_to_supervised(data, lag=1):
    """Turn a series into a supervised-learning table of lagged inputs.

    The returned frame holds the series shifted by 1, 2, ..., ``lag`` steps
    (in that column order) followed by the unshifted series as the last
    column. The NaN values introduced by shifting are replaced with 0.

    Args:
        data: Anything accepted by ``pd.DataFrame``.
        lag: Number of lagged copies to place before the original column.

    Returns:
        A ``pd.DataFrame`` with the lagged columns and the original column.
    """
    frame = pd.DataFrame(data)
    lagged = [frame.shift(step) for step in range(1, lag + 1)]
    combined = pd.concat([*lagged, frame], axis=1)
    return combined.fillna(0)
def scale(train, test):
    """Scale train and test data into the range [-1, 1].

    The ``MinMaxScaler`` is fitted on ``train`` only and then applied to both
    arrays, so the test data is transformed with the training statistics.

    Args:
        train: Training array used to fit the scaler.
        test: Test array transformed with the fitted scaler.

    Returns:
        A tuple ``(scaler, train_scaled, test_scaled)``.
    """
    fitted = MinMaxScaler(feature_range=(-1, 1)).fit(train)
    return fitted, fitted.transform(train), fitted.transform(test)
def invert_scale(scaler, X, y):
    """Map a scaled prediction back to the original scale.

    The input features ``X`` and the value ``y`` are joined into one row so
    that the row has the same layout the scaler was fitted on; only the last
    entry of the inverse-transformed row is returned.

    Args:
        scaler: A fitted object exposing ``inverse_transform``.
        X: One-dimensional iterable of scaled input features.
        y: Scaled value to convert back.

    Returns:
        The last element of the inverse-transformed row.
    """
    values = np.array([*X, y])
    row = values.reshape(1, values.shape[0])
    restored = scaler.inverse_transform(row)
    return restored[0, -1]
2.4 LSTM
def fit_lstm(train, batch_size, nb_epoch, neurons):
    """Build and train a stateful single-layer LSTM regressor.

    Args:
        train: 2-D array whose last column is the target and whose other
            columns are the input features.
        batch_size: Batch size used for both the model's fixed input shape
            and for training.
        nb_epoch: Number of passes over the training data.
        neurons: Number of units in the LSTM layer.

    Returns:
        The trained ``Sequential`` model.
    """
    features, targets = train[:, :-1], train[:, -1]
    # Reshape to (samples, 1, features): every sample is a single time step.
    features = features.reshape(features.shape[0], 1, features.shape[1])
    input_shape = (batch_size, features.shape[1], features.shape[2])

    model = Sequential()
    model.add(LSTM(neurons, batch_input_shape=input_shape, stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Run each epoch as its own fit() call, without shuffling, and clear the
    # layer's carried state after every pass.
    for _ in range(nb_epoch):
        model.fit(features, targets, batch_size=batch_size, verbose=1, shuffle=False)
        model.reset_states()
    return model
def forecast_lstm(model, batch_size, X):
    """Make a one-step forecast for a single sample.

    Args:
        model: Object exposing ``predict(inputs, batch_size=...)``.
        batch_size: Batch size forwarded to ``model.predict``.
        X: One-dimensional array of input features for one sample.

    Returns:
        Element ``[0, 0]`` of the prediction array.
    """
    # Shape the sample as (1 sample, 1 time step, n features).
    sample = X.reshape(1, 1, len(X))
    return model.predict(sample, batch_size=batch_size)[0, 0]
2.5 绘图函数
def plot_df(df, x, y, title="", xlabel='日期', ylabel='y', dpi=100):
    """Plot ``y`` against ``x`` as a red line on a 16x5 figure and show it.

    The title and both axis labels are rendered with the module-level
    ``kaiti`` font. ``df`` is accepted but is not used by the body.

    Args:
        df: Unused.
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Figure title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        dpi: Resolution passed to ``plt.figure``.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    plt.plot(x, y, color='red')
    label_setters = ((plt.title, title), (plt.xlabel, xlabel), (plt.ylabel, ylabel))
    for apply_label, text in label_setters:
        apply_label(text, fontproperties=kaiti)
    plt.show()
2.6 导入数据
# Load the tab-separated data file, parsing the 'date' column and using it as
# the index.
df_drug = pd.read_csv(
    'data.csv',
    sep='\t',
    index_col='date',
    parse_dates=['date'],
)
print(df_drug.head())
# Show the raw 'close' series before any modelling.
plot_df(df_drug, x=df_drug.index, y=df_drug['close'], title='时间序列', ylabel='y')
2.7 数据预处理并分出测试集
# Series to forecast.
raw_value = df_drug['close']
# First-order differences of the series.
diff_value = difference(raw_value, 1)
# Supervised table: one lagged input column followed by the target column.
supervised = timeseries_to_supervised(diff_value, 1)
supervised_value = supervised.values
# Hold out the last testNum rows as the test set.
testNum = 200
train = supervised_value[:-testNum]
test = supervised_value[-testNum:]
# The scaler is fitted on the training rows only.
scaler, train_scaled, test_scaled = scale(train, test)
2.8 训练
# Build and train the LSTM: batch size 1, one epoch, 4 units.
lstm_model = fit_lstm(train_scaled, 1, 1, 4)

# Forecast the test rows one step at a time.
predictions = list()
for i, row in enumerate(test_scaled):
    # Use the *scaled* test row: the model was trained on scaled data, and
    # invert_scale() expects features on the scaler's output scale. Slicing
    # the unscaled `test` array here feeds both of them the wrong inputs.
    X, y = row[0:-1], row[-1]
    yhat = forecast_lstm(lstm_model, 1, X)
    # Undo the scaling, then the differencing, to get back to the raw scale.
    yhat = invert_scale(scaler, X, yhat)
    yhat = invert_difference(raw_value, yhat, len(test_scaled) + 1 - i)
    predictions.append(yhat)
2.9 计算误差并可视化
# Compute the errors against the observations the predictions refer to: the
# LAST testNum points of the series (the same window plotted below), not the
# first testNum points. Plain arrays are used so the subtraction is
# positional.
actual = np.asarray(raw_value)[-testNum:]
predicted = np.asarray(predictions)
mse = mean_squared_error(actual, predicted)
# Mean absolute error relative to the true value, as a fraction (not percent).
re = np.mean(np.abs(actual - predicted) / actual)
rmse = mse ** 0.5
print("Test MSE:",mse)
print("Test RMSE:",rmse)
print("Relative Error", re)
# Overlay the predictions for the test window on the full true series.
plt.plot(df_drug.index, raw_value)
plt.plot(df_drug.index[-testNum:], predictions)
plt.legend(['true','pred'])
plt.show()
3. 预测结果
3.1 已知数据
3.2 预测效果
3.3 预测误差
Test MSE: 722.0236296608875
Test RMSE: 26.870497383950443
Relative Error 4.436888940577301%