您现在的位置是：首页 > 文章详情

python机器学习速成|1|数据导入

日期：2018-10-16点击：385收藏

主要任务：
①完成常见的数据导入操作，包括数据导入，缺失值填充
②完成常见的机器学习数据准备，包括特征二值化和训练集测试集的划分等

# -*- coding: utf-8 -*-
"""
Day 1: data preprocessing -- load the dataset and split it into X and Y.

Created on Wed Oct 17 00:26:22 2018

@author: Administrator
"""

# The original script started with the IPython magics "%reset -f" and
# "%clear". Those lines are a syntax error in a plain Python interpreter,
# so they are issued through the IPython API only when it is available.
try:
    get_ipython().run_line_magic("reset", "-f")
    get_ipython().run_line_magic("clear", "")
except NameError:
    # Not running under IPython/Spyder: there is no session to reset.
    pass

# In[*]
# Step 1: Importing the libraries
# Day 1: Data Preprocessing
import os

import numpy as np
import pandas as pd

# Raw string: "\m" and "\c" are not valid escape sequences, so the plain
# literal only worked by accident (and warns on recent Python versions).
os.chdir(r"E:\multi\ml\coad")

# In[*]
# Step 2: Importing dataset
# First CSV row is the header, first CSV column is the row index.
dataset = pd.read_csv('coad_messa.csv', header=0, index_col=0)

# Every column except the last is an input feature ...
X = dataset.iloc[:, :-1].values
# ... and the last column is the prediction target. The original indexed
# it as column 6, which is the same column for the 7-column dataset whose
# output is shown in the article; -1 keeps X and Y consistent.
Y = dataset.iloc[:, -1].values

# In[*]
print("Step 2: Importing dataset")
print("X")
print(X)
print("Y")
print(Y)

这一步主要是导入数据。数据的前6列是用来预测的输入特征（包括 gender、stage 等），我们将其设置为X；第7列（即最后一列）是输出数据，也就是预测目标——患者样本的类别（例如肿瘤或者正常），我们将其设置为Y。

 Step 2: Importing dataset X [[61. 0. 1. 1. 1. 1.] [67. 1. 3. 1. 2. 3.] [42. 0. 2. 2. 1. 1.] ... [44. 0. 2. 1. 2. 1.] [82. 1. 2. 1. 2. 1.] [52. 0. 2. 2. 1. 1.]] > Y [0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. > 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]

# In[*]
# Step 3: Handling the missing data
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.
# SimpleImputer replaces it; it always imputes column by column, which is
# what the old axis=0 argument requested, and NaN is given as np.nan
# rather than the string "NaN".
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Only columns 1 and 2 of X are imputed; the other columns are untouched.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# In[*]
print("---------------------")
print("Step 3: Handling the missing data")
print("step2")
print("X")
print(X)

# In[*]
# Step 4: Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Map the distinct values of column 2 to integer codes 0..k-1.
labelencoder_X = LabelEncoder()
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])

# In[*]
# Creating a dummy variable
# OneHotEncoder(categorical_features=[2]) was removed in scikit-learn 0.22.
# ColumnTransformer selects the column instead. As before, the one-hot
# columns come first and the remaining columns follow in their original
# order. sparse_threshold=0 forces a dense array, matching the old
# ".toarray()" call.
onehotencoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [2])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = onehotencoder.fit_transform(X)

# Encode the target labels as integers 0..n_classes-1.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# In[*]
print("---------------------")
print("Step 4: Encoding categorical data")
print("X")
print(X)
print("Y")
print(Y)

这一步主要是将其中的类别数据二值化。我们使用的数据包括性别，众所周知，性别是男性或者女性。虽然我们可以简单地将其设置为0和1，或者设置为1和2，但是这样的整数编码会让模型误以为类别之间存在大小或顺序关系，因此并不可靠。

在特征工程中，有时会用到LabelEncoder和OneHotEncoder。比如kaggle中的性别（sex）属性，一般只有male和female两个取值。一种不靠谱的做法是直接用0表示male、用1表示female；上面已经说过这是不靠谱的，所以要用one-hot编码。首先我们需要用LabelEncoder把sex这个属性列里面的离散取值用数字来表示，也就是上面代码的过程：把male、female这类字符型的属性值转换成数字，然后再用OneHotEncoder将其转换为one-hot编码。

# In[*]
# Step 5: Splitting the datasets into training sets and Test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# In[*]
print("---------------------")
print("Step 5: Splitting the datasets into training sets and Test sets")
for label, values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(label)
    print(values)

# In[*]
# Step 6: Feature Scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# The scaler is fitted on the training set only; the test set is then
# transformed with that same fitted scaler.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# In[*]
print("---------------------")
print("Step 6: Feature Scaling")
for label, values in (("X_train", X_train), ("X_test", X_test)):
    print(label)
    print(values)

最终我们将数据划分成训练集（80%）和测试集（20%），并对特征进行了标准化。

原文链接：https://yq.aliyun.com/articles/655585

关注公众号

低调大师中文资讯倾力打造互联网数据资讯、行业资源、电子商务、移动互联网、网络营销平台。

持续更新报道IT业界、互联网、市场资讯、驱动更新,是最及时权威的产业资讯及硬件资讯报道平台。

转载内容版权归作者及来源网站所有，本站原创内容转载请注明来源。