Commit fae53b9

add Chapter1

1 parent 2995e3e commit fae53b9

File tree

7 files changed: +2217 -73 lines changed
Lines changed: 398 additions & 0 deletions

@@ -0,0 +1,398 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# OneR is short for \"One Rule\": classify using just one of the many available features.\n",
    "\n",
    "# Algorithm idea: for each feature, walk through every value it takes; for each value, count\n",
    "# how often it appears in each class and pick the class it appears in most often. Its\n",
    "# appearances in the other classes are counted as errors.\n",
    "\n",
    "import numpy as np"
   ]
  },
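  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added illustration, not part of the original chapter.)* Before the real implementation below, a minimal sketch of the counting idea on a tiny hand-made dataset: for one feature, tally how often each of its values co-occurs with each class; the majority class becomes the prediction and everything else counts as error. The names `toy_X` and `toy_y` are made up for this sketch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Minimal sketch of the OneR counting idea (hypothetical toy data).\n",
    "from collections import Counter\n",
    "\n",
    "toy_X = [0, 0, 0, 1, 1, 1, 1]                # one binary feature\n",
    "toy_y = ['a', 'a', 'b', 'b', 'b', 'b', 'a']  # class labels\n",
    "\n",
    "for value in set(toy_X):\n",
    "    # Count co-occurrences of this feature value with each class\n",
    "    counts = Counter(cls for v, cls in zip(toy_X, toy_y) if v == value)\n",
    "    best_class, best_count = counts.most_common(1)[0]\n",
    "    error = sum(counts.values()) - best_count\n",
    "    print(value, '->', best_class, 'error:', error)\n",
    "# value 0 -> 'a' (error 1), value 1 -> 'b' (error 1): total error 2 for this feature"
   ]
  },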
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iris Plants Database\n",
      "\n",
      "Notes\n",
      "-----\n",
      "Data Set Characteristics:\n",
      "    :Number of Instances: 150 (50 in each of three classes)\n",
      "    :Number of Attributes: 4 numeric, predictive attributes and the class\n",
      "    :Attribute Information:\n",
      "        - sepal length in cm\n",
      "        - sepal width in cm\n",
      "        - petal length in cm\n",
      "        - petal width in cm\n",
      "        - class:\n",
      "                - Iris-Setosa\n",
      "                - Iris-Versicolour\n",
      "                - Iris-Virginica\n",
      "    :Summary Statistics:\n",
      "\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "                    Min  Max   Mean    SD   Class Correlation\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "    sepal length:   4.3  7.9   5.84   0.83    0.7826\n",
      "    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n",
      "    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n",
      "    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "\n",
      "    :Missing Attribute Values: None\n",
      "    :Class Distribution: 33.3% for each of 3 classes.\n",
      "    :Creator: R.A. Fisher\n",
      "    :Donor: Michael Marshall (MARSHALL%[email protected])\n",
      "    :Date: July, 1988\n",
      "\n",
      "This is a copy of UCI ML iris datasets.\n",
      "http://archive.ics.uci.edu/ml/datasets/Iris\n",
      "\n",
      "The famous Iris database, first used by Sir R.A Fisher\n",
      "\n",
      "This is perhaps the best known database to be found in the\n",
      "pattern recognition literature. Fisher's paper is a classic in the field and\n",
      "is referenced frequently to this day. (See Duda & Hart, for example.) The\n",
      "data set contains 3 classes of 50 instances each, where each class refers to a\n",
      "type of iris plant. One class is linearly separable from the other 2; the\n",
      "latter are NOT linearly separable from each other.\n",
      "\n",
      "References\n",
      "----------\n",
      "   - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
      "     Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
      "     Mathematical Statistics\" (John Wiley, NY, 1950).\n",
      "   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
      "     (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n",
      "   - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
      "     Structure and Classification Rule for Recognition in Partially Exposed\n",
      "     Environments\". IEEE Transactions on Pattern Analysis and Machine\n",
      "     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
      "   - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n",
      "     on Information Theory, May 1972, 431-433.\n",
      "   - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n",
      "     conceptual clustering system finds 3 classes in the data.\n",
      "   - Many, many more ...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the data\n",
    "\n",
    "from sklearn.datasets import load_iris\n",
    "\n",
    "dataset = load_iris()\n",
    "\n",
    "# Get the feature matrix and the target classes\n",
    "X = dataset.data\n",
    "y = dataset.target\n",
    "\n",
    "print(dataset.DESCR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "150 4\n"
     ]
    }
   ],
   "source": [
    "# Get the number of samples and the number of features\n",
    "\n",
    "n_samples, n_features = X.shape\n",
    "\n",
    "print(n_samples, n_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.46366666667\n",
      "(150, 4)\n"
     ]
    }
   ],
   "source": [
    "# Discretize the continuous features into 0/1 by thresholding at the mean.\n",
    "# Note: X.mean() averages over the whole matrix, so this is a single global\n",
    "# threshold (3.4637 here), not one mean per attribute as the name suggests.\n",
    "attribute_means = X.mean()\n",
    "print(attribute_means)\n",
    "\n",
    "X_d = np.array(X >= attribute_means, dtype='int')\n",
    "print(X_d.shape)"
   ]
  },
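  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added note.)* Because `X.mean()` returns a single scalar, every feature above is thresholded by the same global mean. A per-attribute threshold, which the variable name hints at, would use `axis=0`; a sketch of that variant (it changes the discretization, and therefore the numbers downstream):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sketch of the per-attribute variant (not what this chapter runs with).\n",
    "per_feature_means = X.mean(axis=0)  # one mean per column, shape (4,)\n",
    "print(per_feature_means)\n",
    "X_d_alt = np.array(X >= per_feature_means, dtype='int')\n",
    "print(X_d_alt.shape)  # still (150, 4)"
   ]
  },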
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train:(112, 4)\n",
      "test:(38, 4)\n"
     ]
    }
   ],
   "source": [
    "# Split into training and test sets\n",
    "# (sklearn.cross_validation is the old module path; in current scikit-learn\n",
    "# releases the same function lives in sklearn.model_selection)\n",
    "\n",
    "from sklearn.cross_validation import train_test_split\n",
    "\n",
    "random_state = 14\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)  # test size defaults to 25%\n",
    "\n",
    "print(\"train:%s\\ntest:%s\" % (X_train.shape, X_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from operator import itemgetter\n",
    "\n",
    "def train(X, y_true, feature):\n",
    "    n_samples, n_features = X.shape\n",
    "    assert 0 <= feature < n_features\n",
    "\n",
    "    # Get the distinct values this feature takes in the training set\n",
    "    values = set(X[:, feature])\n",
    "\n",
    "    predictors = dict()\n",
    "    errors = []\n",
    "\n",
    "    for current_value in values:\n",
    "        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)\n",
    "        predictors[current_value] = most_frequent_class\n",
    "        errors.append(error)\n",
    "\n",
    "    total_error = sum(errors)\n",
    "    return predictors, total_error\n",
    "\n",
    "\n",
    "# For one feature value, find the class it appears in most often\n",
    "def train_feature_value(X, y_true, feature, value):\n",
    "    class_counts = defaultdict(int)\n",
    "    # Count how many samples with this feature value fall into each class\n",
    "    for sample, y in zip(X, y_true):\n",
    "        if sample[feature] == value:\n",
    "            class_counts[y] += 1\n",
    "    # Sort the classes by count, most frequent first\n",
    "    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)\n",
    "    most_frequent_class = sorted_class_counts[0][0]\n",
    "\n",
    "    # The appearances in every other class are errors\n",
    "    error = sum([class_count for class_value, class_count in class_counts.items()\n",
    "                 if class_value != most_frequent_class])\n",
    "\n",
    "    return most_frequent_class, error\n"
   ]
  },
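  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added sanity check.)* A quick way to see what `train` returns: run it on a single feature of the discretized training set. It should hand back a dict mapping each discretized value (0 or 1) to a majority class, plus that feature's total error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sanity check on feature 0 (unexecuted sketch)\n",
    "predictors_0, total_error_0 = train(X_train, y_train, 0)\n",
    "print(predictors_0)   # e.g. {0: majority class, 1: majority class}\n",
    "print(total_error_0)  # misclassifications if feature 0 alone were used"
   ]
  },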
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The best model is based on variable 2 and has error 37.00\n",
      "{'predictor': {0: 0, 1: 2}, 'variable': 2}\n"
     ]
    }
   ],
   "source": [
    "# Train a predictor for every feature\n",
    "all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}\n",
    "errors = {variable: error for variable, (mapping, error) in all_predictors.items()}\n",
    "\n",
    "# Sort the features by total error and take the best one\n",
    "best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]\n",
    "print(\"The best model is based on variable {0} and has error {1:.2f}\".format(best_variable, best_error))\n",
    "\n",
    "# The model is just the best feature plus its value-to-class mapping\n",
    "model = {'variable': best_variable,\n",
    "         'predictor': all_predictors[best_variable][0]}\n",
    "print(model)\n"
   ]
  },
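  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added note.)* Reading the model: the chosen variable 2 is petal length, and its predictor maps discretized value 0 to class 0 and value 1 to class 2. Class 1 can therefore never be predicted, which matters for the report further down. A small check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Spell out the learned rule in words (unexecuted sketch)\n",
    "print(dataset.feature_names[model['variable']])\n",
    "for value, cls in sorted(model['predictor'].items()):\n",
    "    print('discretized value', value, '->', dataset.target_names[cls])"
   ]
  },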
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Predict using the single chosen feature\n",
    "def predict(X_test, model):\n",
    "    variable = model['variable']\n",
    "    predictor = model['predictor']\n",
    "    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])\n",
    "    return y_predicted\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2\n",
      " 2]\n"
     ]
    }
   ],
   "source": [
    "# Print the predictions for the test set\n",
    "y_predicted = predict(X_test, model)\n",
    "print(y_predicted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is 65.7894736842\n"
     ]
    }
   ],
   "source": [
    "# Compute the accuracy: the fraction of test samples predicted correctly\n",
    "accuracy = np.mean(y_predicted == y_test) * 100\n",
    "print(\"accuracy is %s\" % (accuracy))"
   ]
  },
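  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added cross-check.)* The manual mean above should agree with scikit-learn's own accuracy helper:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same accuracy via sklearn (unexecuted sketch)\n",
    "from sklearn.metrics import accuracy_score\n",
    "print(accuracy_score(y_test, y_predicted) * 100)"
   ]
  },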
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.94      1.00      0.97        17\n",
      "          1       0.00      0.00      0.00        13\n",
      "          2       0.40      1.00      0.57         8\n",
      "\n",
      "avg / total       0.51      0.66      0.55        38\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/xxg/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Print the per-class precision/recall report\n",
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(y_test, y_predicted))"
   ]
  },
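  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added explanation.)* The `UndefinedMetricWarning` is expected here: the OneR predictor only ever outputs classes 0 and 2, so no test sample is predicted as class 1 and its precision is undefined (reported as 0.00 in the table). A one-line check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Confirm which classes the model can actually emit (unexecuted sketch)\n",
    "print(set(y_predicted))  # expected: {0, 2}"
   ]
  },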
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
