前言

本文章用于收录本学期所有机器学习代码, 欢迎大家查看

如代码有误可联系我!

本人会在最快的时间内把Bug修复

2023/2/23

归一化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as knn

# Min-max normalization demo: 6 samples x 5 numeric features on very
# different scales, rescaled to [0, 1] per feature before running KNN.
ar_x = [[4032, 1680, 1450, 5.3, 5.6],
        [4330, 1535, 1885, 7.8, 14.5],
        [4053, 1740, 1449, 6.2, 10.8],
        [5087, 1868, 1500, 8.5, 25.6],
        [4560, 1822, 1645, 7.8, 15.8],
        [3797, 1510, 1820, 5.5, 9.6]]
ar_y = [0, 1, 0, 1]  # labels for the first 4 samples (training set)
ar_min = np.min(ar_x, 0)  # per-feature minimum (column-wise)
print(ar_min)
# NOTE(review): axis=1 gives the per-ROW max, unlike the per-feature
# (axis=0) statistics used everywhere else — presumably a debug print;
# kept as-is to preserve output.
print(np.max(ar_x, 1))

ar_mn = np.max(ar_x, 0) - ar_min  # per-feature range (max - min)
print(ar_mn)

# Rescale every feature into [0, 1], rounded to 3 decimals.
nor_ar = np.around((ar_x - ar_min) / ar_mn, 3)
print(nor_ar)

# Train on the first 4 normalized rows, predict the last 2.
model = knn(n_neighbors = 3)
model.fit(nor_ar[:4], ar_y)
pre = model.predict(nor_ar[4:6])
print(pre)

2023/2/23

红酒

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split

# Load the wine dataset (could also import load_wine directly).
wine_dataset = datasets.load_wine()
print("红酒数据集中的键为:\n",wine_dataset.keys())

print("数据概况:",wine_dataset['data'].shape)

# Hold out 25% of the samples for testing.
split = train_test_split(wine_dataset['data'],wine_dataset['target'], test_size=0.25,random_state = 0)
X_train, X_test, y_train, y_test = split
print('X_train shape:',X_train.shape,'\n','X_test shape:', X_test.shape)
print('y_train shape:',y_train.shape,'\n','y_test shape:', y_test.shape)

# Fit a 3-nearest-neighbour classifier and report test/train accuracy.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

test_score = knn.score(X_test, y_test)
train_score = knn.score(X_train, y_train)
print('模型评分:{:.2f}'.format(test_score))
print('模型评分2:{:.2f}'.format(train_score))

# Classify one new, unseen wine sample (13 feature values).
X_new = np.array([[13.2, 2.77, 2.51, 18.5, 96.6, 1.04, 2.55, 0.57, 1.47, 6.2, 1.05, 3.88, 820]])
result = knn.predict(X_new)
print("预测红酒的分类为:", wine_dataset['target_names'][result])

归一化处理后的

1
2
3
4
5
6
7
8
9
10
11
# Repeat the wine classification after min-max normalizing the features.
# FIX: y was mistakenly assigned wine_dataset['data']; it should be the
# label vector (harmless before only because the split re-read 'target').
X, y = wine_dataset['data'], wine_dataset['target']
X_min = np.min(X, 0)            # per-feature minimum
X_mn = np.max(X, 0) - X_min     # per-feature range
nor_X = np.around((X - X_min) / X_mn, 4)  # rescaled to [0, 1], 4 decimals

X_train, X_test, y_train, y_test = train_test_split(nor_X, y, test_size=0.25,random_state = 0)
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print('模型评分:{:.2f}'.format(knn.score(X_test, y_test)))
print('模型评分2:{:.2f}'.format(knn.score(X_train, y_train)))

鸢尾花

1
2
3
4
5
6
7
8
9
10
11
12
13
# Iris classification with min-max normalized features.
from sklearn.datasets import load_iris
iris_data = load_iris()
# FIX: y was mistakenly assigned iris_data['data']; it should be the
# label vector (harmless before only because the split re-read 'target').
X, y = iris_data['data'], iris_data['target']
X_min = np.min(X, 0)            # per-feature minimum
X_mn = np.max(X, 0) - X_min     # per-feature range
nor_X = np.around((X - X_min) / X_mn, 4)  # rescaled to [0, 1], 4 decimals

X_train, X_test, y_train, y_test = train_test_split(nor_X, y, test_size=0.25,random_state = 0)
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print('模型评分:{:.2f}'.format(knn.score(X_test, y_test)))
print('模型评分2:{:.2f}'.format(knn.score(X_train, y_train)))

出行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Travel-mode classification: one-hot encode categorical features, then KNN.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

data = pd.read_csv('knn-chuxing.csv',encoding = 'gbk')
data.head()
X = data.iloc[:, 1:4]   # three categorical feature columns
Y = data.iloc[:, 4:5]   # label column

# One-hot encode the categorical features.
enc = OneHotEncoder()
enc.fit(X)

X = enc.transform(X).toarray()
# FIX: Y is an (n, 1) column, so tolist() produced a list of one-element
# lists, triggering sklearn's DataConversionWarning; flatten it to 1-D.
z = np.array(Y).ravel().tolist()
print(X)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, z, test_size=0.25,random_state = 0)

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
print('模型评分:{:.2f}'.format(knn.score(X_test, y_test)))
print('模型评分2:{:.2f}'.format(knn.score(X_train, y_train)))

# Predict one new one-hot encoded sample (notebook-style bare expression).
X_new = [[1, 0, 0, 1, 0, 0, 1]]
pre = knn.predict(X_new)
pre

y_pred = knn.predict(X_test)
y_pred

2023/2/28

knn-StrawBerryAnalysis.csv 文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import train_test_split

# Read the strawberry dataset.
import pandas as pd
df = pd.read_csv("knn-StrawBerryAnalysis.csv")

x = df.iloc[:, 1 : 7]   # six feature columns
y = df.iloc[:, 7 : 8]   # label column
x = np.array(x)
# FIX: was y.reshape(21793,), which breaks on any other row count;
# reshape(-1) flattens regardless of dataset size.
y = np.array(y).reshape(-1)

# Min-max normalize the features to [0, 1].
X = x
X_min = np.min(X, 0)
X_mn = np.max(X, 0) - X_min
nor_X = np.around((X - X_min) / X_mn, 4)

# Sweep k and record the cross-validation error for each value.
# (Loop body indentation restored — it was lost in the original paste.)
k_range = range(1, 31)
k_error = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, nor_X, y)
    k_error.append(1 - scores.mean())

# Plot: x-axis is k, y-axis is the cross-validation error.
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()

X_train, X_test, y_train, y_test = train_test_split(nor_X , y, test_size=0.25,random_state = 0)

knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train, y_train)

print('模型评分:{:.2f}'.format(knn.score(X_test, y_test)))
print('模型评分2:{:.2f}'.format(knn.score(X_train, y_train)))

口红,啤酒分类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import pandas as pd
# FIX: GaussianNB was used below but never imported in this snippet,
# so it raised NameError when run standalone.
from sklearn.naive_bayes import GaussianNB

df = pd.read_excel("data.xlsx")

# Rows 1-12 train, rows 13-16 test; column 1 is the label, 2+ are features.
train_x = df.iloc[1 : 13, 2 :]
test_x = df.iloc[13 : 17, 2 :]
train_y = df.iloc[1: 13, 1]
test_y = df.iloc[13 : 17, 1]

# Fit a Gaussian naive Bayes classifier and report test accuracy.
gnb = GaussianNB()
fitl = gnb.fit(train_x, train_y)
pre = gnb.predict(test_x)
r_n = (test_y == pre).sum()  # count of correct predictions
print("总测试条目 = %d, 正确率 = %5.2f"%(len(test_x), float(r_n / len(test_x) * 100)))

# Predict two held-out rows.
x_new = df.iloc[17 : 19, 2 :]
pre = gnb.predict(x_new)
print(pre)

学习通视频里的模型评分, 用的列表实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Parse data.csv by hand: columns 0-3 are int features, column 4 the label.
x = []
y = []

# FIX: the file handle was opened without ever being closed; a context
# manager guarantees cleanup. (Loop indentation restored from the paste.)
with open('data.csv', 'r') as of:
    for line in of:
        li_t = line.split(',')
        y.append(int(li_t[4]))
        x.append([int(li_t[0]), int(li_t[1]), int(li_t[2]), int(li_t[3])])

# 90/10 split by position (no shuffling).
fd = int(len(x) * 0.9)
train_x = x[: fd]
test_x = x[fd: ]
train_y = y[: fd]
test_y = y[fd: ]

gnb = GaussianNB()
fitl = gnb.fit(train_x, train_y)
pre = gnb.predict(test_x)
# list == ndarray broadcasts elementwise, so .sum() counts correct hits.
r_n = (test_y == pre).sum()
print("总测试条目 = %d, 正确率 = %5.2f"%(len(test_x), float(r_n / len(test_x) * 100)))