# 数据预处理方法整理（数学建模）

## 数据清洗

``````import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 读取xls数据文件

### 合并多个文件数据

``````import glob,os

filenames_in = r'D:in'  # 输入文件的文件地址
filenames_out = r'D:inner'  # 新文件的地址
path_in  = r'D:in'
file_names = os.listdir(path_in)
file_paths = glob.glob(os.path.join(path_in,'*.csv'))
print(file_paths)

df1 = pd.DataFrame()
for file in file_paths:
#df2=df2.iloc[:,2]  #只取第三列
df1 = pd.concat([df1, df2], axis=0)      #axis=0意思是纵向拼接，=1的时候是横向拼接
print('dataframe的维度是：', df1.shape)
#print(df1)

# 输出数据到本地
df1.to_csv(r'D:innerresult.csv', index=False, sep=',')``````

### 填补缺失值

``````def fill_missing_values(df):
"""用DataFrame中各列的均值或众数来填补空值"""
for column in df:
if df[column].dtype == np.number:  # 如果数据是数字类型
mean = df[column].mean()
df[column].fillna(mean, inplace=True)  # 用均值填补空值
else:  # 如果数据不是数字类型
mode = df[column].mode().iloc[0]  # 找到最频繁出现的项
df[column].fillna(mode, inplace=True)  # 用众数填补空值

return df
# Apply the filler to the working DataFrame
data=fill_missing_values(data)``````

### 去除数据中的符号

``````import string
from zhon.hanzi import punctuation

punctuation_string = string.punctuation
for i in punctuation_string:
data= data.replace(i, '')
punctuation_str = punctuation
for i in punctuation_str:
data = data.replace(i, '')
``````

四万个数据不到一秒就全部填充好了，速度还是比较快的。然后直接把c列删除即可。

### 去除冗余数据

``````for col in data.columns:
# 如果这一列所有的值都相等
if data[col].nunique() == 1:
# 则删除这一列
data = data.drop(col, axis=1)``````

### 格式转换

``data['11'] = data['11'].astype(int)  # cast column '11' to integer dtype``

### 合并某几列数据

``````# Merge the month / day / time-of-day columns into one "M-D-H:M:S" string,
# then drop the three source columns.
data['timestamp'] = data['月'].astype(str) + '-' + data['日'].astype(str) + '-' + data['具体时间']
data = data.drop(['月', '日', '具体时间'], axis=1)

# Optionally make the timestamp column the index
#data.set_index('timestamp', inplace=True)

# Parse into a real datetime, pinning the year to 2023
data['time'] = pd.to_datetime('2023-' + data['timestamp'], format='%Y-%m-%d-%H:%M:%S')``````

``````class_df = (data['ROLL_ATT1']+data['ROLL_ATT2'])/2
data['ROLL_ATT1']=class_df

### 数据可视化

``````import matplotlib.pyplot as plt
#画出所有变量随时间变化图像
feature = data.columns[1:]
for feas in feature:
plt.plot(data['time'], data[feas])
plt.xlabel('Time')
plt.ylabel(feas)
plt.show()``````

``````# Plot the distribution of the `train1` column as a 20-bin histogram
plt.hist(df2['train1'], bins=20)
plt.xlabel('train1')
plt.ylabel('Frequency')
# NOTE(review): the title says "Takeoff Weight" but the plotted data is
# `train1` — confirm which is intended.
plt.title('Takeoff Weight Distribution')
plt.show()``````

散点图绘制

``````import seaborn as sns
sns.pairplot(data , hue ='label')
plt.savefig(r"D:pairplot001.png")``````

``````import seaborn as sns
sns.set(style="ticks")

sns.heatmap(data.corr(), annot=True, cmap="YlGnBu");
plt.savefig(r"D:heatmap.png")``````

## 针对机器学习及深度学习数据预处理

``````# Core scientific stack
import numpy as np
import pandas as pd
# Deep-learning stack
import keras
from keras.models import Sequential
# NOTE(review): keras.wrappers.scikit_learn was removed in Keras >= 2.12;
# on modern versions use the separate `scikeras` package instead.
from keras.wrappers.scikit_learn import KerasClassifier
# NOTE(review): np_utils was removed from recent Keras; to_categorical now
# lives at keras.utils.to_categorical.
from keras.utils import np_utils, plot_model
import matplotlib.pyplot as pl
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
from keras.models import model_from_json
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
import itertools``````

``````data2=data.drop(['label'],axis=1)

X = np.expand_dims(data2.astype(float), axis=2)
Y = data['label']

print(X.shape)
print(Y.shape)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=20)
x_valid, x_test, y_test, y_valid=train_test_split(X, Y, test_size=0.5, random_state=20)``````

``````# NOTE(review): StandardScaler expects 2-D input (n_samples, n_features);
# X built with np.expand_dims(..., axis=2) earlier in this file is 3-D and
# would raise here — scale before adding the channel axis, or reshape first.
standard = StandardScaler()

# Fit on the training set only: the scaler stores the training mean and std
x_train = standard.fit_transform(x_train)

# Apply the training-set statistics to the test set (avoids data leakage)
x_test = standard.transform(x_test)``````

``````from keras.utils import to_categorical
y_test = to_categorical(y_test)
y_train = to_categorical(y_train)

from keras import backend as K
K.set_image_dim_ordering("tf")
#one_hot编码转换
def one_hot(Train_Y, Test_Y):
Train_Y = np.array(Train_Y).reshape([-1, 1])
Test_Y = np.array(Test_Y).reshape([-1, 1])
Encoder = preprocessing.OneHotEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y).toarray()
Test_Y = Encoder.transform(Test_Y).toarray()
Train_Y = np.asarray(Train_Y, dtype=np.int32)
Test_Y = np.asarray(Test_Y, dtype=np.int32)
return Train_Y, Test_Y

y_train, y_test = one_hot(y_train, y_test)``````

``````# Training hyper-parameters
batch_size = 128
epochs = 40 # number of training epochs
num_classes = 6 # total number of classes
length = 2048
BatchNorm = False # whether to apply batch normalization
number = 1000 # number of samples per class
normal = False # whether to standardize``````

重塑训练参数，否则传入模型时会出错

``````# Reshape inputs to (samples, timesteps, 1) so they match the rank Conv1D
# expects.
# NOTE(review): if X already got a channel axis via np.expand_dims earlier,
# this reshape is a no-op — confirm which path produced x_train here.
x_train=x_train.reshape((x_train.shape[0],x_train.shape[1],1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1],1))``````

``````# # 改变dataset的大小，变成batch_size的倍数
def change_dataset_size(x, y, batch_size):
length = len(x)
if (length % batch_size != 0):
remainder = length % batch_size
x = x[:(length - remainder)]
y = y[:(length - remainder)]
return x, y

# Trim the training and validation sets to whole batches.
x_train,y_train=change_dataset_size(x_train,y_train,batch_size)

# NOTE(review): x_test / y_test are never trimmed — confirm that is intended.
x_valid, y_valid=change_dataset_size(x_valid, y_valid,batch_size)``````

THE END
