Commit fae53b9

add Chapter1

1 parent 2995e3e commit fae53b9

File tree

7 files changed: +2217 -73 lines changed
Lines changed: 398 additions & 0 deletions

@@ -0,0 +1,398 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# OneR is short for \"One Rule\": classify using just one of the many available features.\n",
    "\n",
    "# Algorithm idea: for each feature, walk through every value it takes; for each value, count\n",
    "# how often it appears in each class and pick the class it appears in most often. Its\n",
    "# appearances in the other classes are counted as errors.\n",
    "\n",
    "import numpy as np"
   ]
  },
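  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added illustration, not part of the original chapter.)* Before the real implementation below, a minimal sketch of the counting idea on a tiny hand-made dataset: for one feature, tally how often each of its values co-occurs with each class; the majority class becomes the prediction and everything else counts as error. The names `toy_X` and `toy_y` are made up for this sketch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Minimal sketch of the OneR counting idea (hypothetical toy data).\n",
    "from collections import Counter\n",
    "\n",
    "toy_X = [0, 0, 0, 1, 1, 1, 1]                # one binary feature\n",
    "toy_y = ['a', 'a', 'b', 'b', 'b', 'b', 'a']  # class labels\n",
    "\n",
    "for value in set(toy_X):\n",
    "    # Count co-occurrences of this feature value with each class\n",
    "    counts = Counter(cls for v, cls in zip(toy_X, toy_y) if v == value)\n",
    "    best_class, best_count = counts.most_common(1)[0]\n",
    "    error = sum(counts.values()) - best_count\n",
    "    print(value, '->', best_class, 'error:', error)\n",
    "# value 0 -> 'a' (error 1), value 1 -> 'b' (error 1): total error 2 for this feature"
   ]
  },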
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iris Plants Database\n",
      "\n",
      "Notes\n",
      "-----\n",
      "Data Set Characteristics:\n",
      "    :Number of Instances: 150 (50 in each of three classes)\n",
      "    :Number of Attributes: 4 numeric, predictive attributes and the class\n",
      "    :Attribute Information:\n",
      "        - sepal length in cm\n",
      "        - sepal width in cm\n",
      "        - petal length in cm\n",
      "        - petal width in cm\n",
      "        - class:\n",
      "                - Iris-Setosa\n",
      "                - Iris-Versicolour\n",
      "                - Iris-Virginica\n",
      "    :Summary Statistics:\n",
      "\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "                    Min  Max   Mean    SD   Class Correlation\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "    sepal length:   4.3  7.9   5.84   0.83    0.7826\n",
      "    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n",
      "    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n",
      "    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n",
      "    ============== ==== ==== ======= ===== ====================\n",
      "\n",
      "    :Missing Attribute Values: None\n",
      "    :Class Distribution: 33.3% for each of 3 classes.\n",
      "    :Creator: R.A. Fisher\n",
      "    :Donor: Michael Marshall (MARSHALL%[email protected])\n",
      "    :Date: July, 1988\n",
      "\n",
      "This is a copy of UCI ML iris datasets.\n",
      "http://archive.ics.uci.edu/ml/datasets/Iris\n",
      "\n",
      "The famous Iris database, first used by Sir R.A Fisher\n",
      "\n",
      "This is perhaps the best known database to be found in the\n",
      "pattern recognition literature. Fisher's paper is a classic in the field and\n",
      "is referenced frequently to this day. (See Duda & Hart, for example.) The\n",
      "data set contains 3 classes of 50 instances each, where each class refers to a\n",
      "type of iris plant. One class is linearly separable from the other 2; the\n",
      "latter are NOT linearly separable from each other.\n",
      "\n",
      "References\n",
      "----------\n",
      "   - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
      "     Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
      "     Mathematical Statistics\" (John Wiley, NY, 1950).\n",
      "   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
      "     (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n",
      "   - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
      "     Structure and Classification Rule for Recognition in Partially Exposed\n",
      "     Environments\". IEEE Transactions on Pattern Analysis and Machine\n",
      "     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
      "   - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n",
      "     on Information Theory, May 1972, 431-433.\n",
      "   - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n",
      "     conceptual clustering system finds 3 classes in the data.\n",
      "   - Many, many more ...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the data\n",
    "\n",
    "from sklearn.datasets import load_iris\n",
    "\n",
    "dataset = load_iris()\n",
    "\n",
    "# Get the feature matrix and the target classes\n",
    "X = dataset.data\n",
    "y = dataset.target\n",
    "\n",
    "print(dataset.DESCR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "150 4\n"
     ]
    }
   ],
   "source": [
    "# Get the number of samples and the number of features\n",
    "\n",
    "n_samples, n_features = X.shape\n",
    "\n",
    "print(n_samples, n_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.46366666667\n",
      "(150, 4)\n"
     ]
    }
   ],
   "source": [
    "# Discretize the continuous features into 0/1 by thresholding at the mean.\n",
    "# Note: X.mean() averages over the whole matrix, so this is a single global\n",
    "# threshold (3.4637 here), not one mean per attribute as the name suggests.\n",
    "attribute_means = X.mean()\n",
    "print(attribute_means)\n",
    "\n",
    "X_d = np.array(X >= attribute_means, dtype='int')\n",
    "print(X_d.shape)"
   ]
  },
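  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added note.)* Because `X.mean()` returns a single scalar, every feature above is thresholded by the same global mean. A per-attribute threshold, which the variable name hints at, would use `axis=0`; a sketch of that variant (it changes the discretization, and therefore the numbers downstream):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sketch of the per-attribute variant (not what this chapter runs with).\n",
    "per_feature_means = X.mean(axis=0)  # one mean per column, shape (4,)\n",
    "print(per_feature_means)\n",
    "X_d_alt = np.array(X >= per_feature_means, dtype='int')\n",
    "print(X_d_alt.shape)  # still (150, 4)"
   ]
  },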
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train:(112, 4)\n",
      "test:(38, 4)\n"
     ]
    }
   ],
   "source": [
    "# Split into training and test sets\n",
    "# (sklearn.cross_validation is the old module path; in current scikit-learn\n",
    "# releases the same function lives in sklearn.model_selection)\n",
    "\n",
    "from sklearn.cross_validation import train_test_split\n",
    "\n",
    "random_state = 14\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)  # test size defaults to 25%\n",
    "\n",
    "print(\"train:%s\\ntest:%s\" % (X_train.shape, X_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from operator import itemgetter\n",
    "\n",
    "def train(X, y_true, feature):\n",
    "    n_samples, n_features = X.shape\n",
    "    assert 0 <= feature < n_features\n",
    "\n",
    "    # Get the distinct values this feature takes in the training set\n",
    "    values = set(X[:, feature])\n",
    "\n",
    "    predictors = dict()\n",
    "    errors = []\n",
    "\n",
    "    for current_value in values:\n",
    "        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)\n",
    "        predictors[current_value] = most_frequent_class\n",
    "        errors.append(error)\n",
    "\n",
    "    total_error = sum(errors)\n",
    "    return predictors, total_error\n",
    "\n",
    "\n",
    "# For one feature value, find the class it appears in most often\n",
    "def train_feature_value(X, y_true, feature, value):\n",
    "    class_counts = defaultdict(int)\n",
    "    # Count how many samples with this feature value fall into each class\n",
    "    for sample, y in zip(X, y_true):\n",
    "        if sample[feature] == value:\n",
    "            class_counts[y] += 1\n",
    "    # Sort the classes by count, most frequent first\n",
    "    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)\n",
    "    most_frequent_class = sorted_class_counts[0][0]\n",
    "\n",
    "    # The appearances in every other class are errors\n",
    "    error = sum([class_count for class_value, class_count in class_counts.items()\n",
    "                 if class_value != most_frequent_class])\n",
    "\n",
    "    return most_frequent_class, error\n"
   ]
  },
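  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added sanity check.)* A quick way to see what `train` returns: run it on a single feature of the discretized training set. It should hand back a dict mapping each discretized value (0 or 1) to a majority class, plus that feature's total error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sanity check on feature 0 (unexecuted sketch)\n",
    "predictors_0, total_error_0 = train(X_train, y_train, 0)\n",
    "print(predictors_0)   # e.g. {0: majority class, 1: majority class}\n",
    "print(total_error_0)  # misclassifications if feature 0 alone were used"
   ]
  },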
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The best model is based on variable 2 and has error 37.00\n",
      "{'predictor': {0: 0, 1: 2}, 'variable': 2}\n"
     ]
    }
   ],
   "source": [
    "# Train a predictor for every feature\n",
    "all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}\n",
    "errors = {variable: error for variable, (mapping, error) in all_predictors.items()}\n",
    "\n",
    "# Sort the features by total error and take the best one\n",
    "best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]\n",
    "print(\"The best model is based on variable {0} and has error {1:.2f}\".format(best_variable, best_error))\n",
    "\n",
    "# The model is just the best feature plus its value-to-class mapping\n",
    "model = {'variable': best_variable,\n",
    "         'predictor': all_predictors[best_variable][0]}\n",
    "print(model)\n"
   ]
  },
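  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added note.)* Reading the model: the chosen variable 2 is petal length, and its predictor maps discretized value 0 to class 0 and value 1 to class 2. Class 1 can therefore never be predicted, which matters for the report further down. A small check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Spell out the learned rule in words (unexecuted sketch)\n",
    "print(dataset.feature_names[model['variable']])\n",
    "for value, cls in sorted(model['predictor'].items()):\n",
    "    print('discretized value', value, '->', dataset.target_names[cls])"
   ]
  },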
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Predict using the single chosen feature\n",
    "def predict(X_test, model):\n",
    "    variable = model['variable']\n",
    "    predictor = model['predictor']\n",
    "    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])\n",
    "    return y_predicted\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2\n",
      " 2]\n"
     ]
    }
   ],
   "source": [
    "# Print the predictions for the test set\n",
    "y_predicted = predict(X_test, model)\n",
    "print(y_predicted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is 65.7894736842\n"
     ]
    }
   ],
   "source": [
    "# Compute the accuracy: the fraction of test samples predicted correctly\n",
    "accuracy = np.mean(y_predicted == y_test) * 100\n",
    "print(\"accuracy is %s\" % (accuracy))"
   ]
  },
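  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added cross-check.)* The manual mean above should agree with scikit-learn's own accuracy helper:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same accuracy via sklearn (unexecuted sketch)\n",
    "from sklearn.metrics import accuracy_score\n",
    "print(accuracy_score(y_test, y_predicted) * 100)"
   ]
  },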
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.94      1.00      0.97        17\n",
      "          1       0.00      0.00      0.00        13\n",
      "          2       0.40      1.00      0.57         8\n",
      "\n",
      "avg / total       0.51      0.66      0.55        38\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/xxg/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Print the per-class precision/recall report\n",
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(y_test, y_predicted))"
   ]
  },
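  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*(Added explanation.)* The `UndefinedMetricWarning` is expected here: the OneR predictor only ever outputs classes 0 and 2, so no test sample is predicted as class 1 and its precision is undefined (reported as 0.00 in the table). A one-line check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Confirm which classes the model can actually emit (unexecuted sketch)\n",
    "print(set(y_predicted))  # expected: {0, 2}"
   ]
  },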
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
