How to use nearest neighbours for Classification?

This recipe shows how to use k-nearest neighbours for classification, tuning the classifier inside a scikit-learn pipeline with grid search.
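
Before the full pipeline version below, here is a minimal sketch of k-nearest neighbours classification on its own. It is illustrative only: the dataset sizes, the train/test split, and k=5 are assumptions, not part of the original recipe.

## Minimal k-nearest neighbours classification (illustrative sketch)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Generate a small synthetic dataset (sizes chosen for illustration)
X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                           random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a KNN classifier with k=5 (an illustrative choice) and score it
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print('Test accuracy:', knn.score(X_test, y_test))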
In [2]:
## How to use nearest neighbours for Classification
def Snippet_155():
    print()
    print(format('## How to use nearest neighbours for Classification','*^82'))

    import warnings
    warnings.filterwarnings("ignore")

    # load libraries
    from sklearn import decomposition, datasets
    from sklearn import neighbors
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV, cross_val_score
    from sklearn.preprocessing import StandardScaler

    # Generate a synthetic classification dataset (1000 samples, 20 features, 10 classes)
    dataset = datasets.make_classification(n_samples=1000, n_features=20, n_informative=5,
                n_redundant=2, n_repeated=0, n_classes=10, n_clusters_per_class=2,
                weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0,
                scale=1.0, shuffle=True, random_state=None)
    X = dataset[0]
    y = dataset[1]

    # Create a standard scaler object
    sc = StandardScaler()
    # Create a pca object
    pca = decomposition.PCA()
    # Create a k-nearest neighbours classifier object
    KNN = neighbors.KNeighborsClassifier()
    # Create a pipeline of three steps. First, standardize the data.
    # Second, transform the data with PCA.
    # Third, train a k-nearest neighbours classifier on the data.
    pipe = Pipeline(steps=[('sc', sc),
                           ('pca', pca),
                           ('KNN', KNN)])
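    # The pipeline itself behaves like a single estimator: calling pipe.fit(X, y)
    # would standardize, project with PCA, and train KNN with default parameters.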

    # Create Parameter Space
    # Create a list of integers from 1 to 20 (the number of features in X)
    n_components = list(range(1,X.shape[1]+1,1))
    # Create lists of parameter values for KNeighborsClassifier()
    n_neighbors = [2, 3, 5, 10]
    algorithm = ['auto',  'ball_tree', 'kd_tree', 'brute']
    # Create a dictionary of all the parameter options
    # Note that you can access the parameters of a pipeline's steps by using '__'
    parameters = dict(pca__n_components=n_components,
                      KNN__n_neighbors=n_neighbors,
                      KNN__algorithm=algorithm)
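    # The grid above has 20 x 4 x 4 = 320 parameter combinations; GridSearchCV
    # cross-validates each one, so the search refits the pipeline many times.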

    # Conduct Parameter Optimization With Pipeline
    # Create a grid search object
    clf = GridSearchCV(pipe, parameters)
    # Fit the grid search
    clf.fit(X, y)

    # View The Best Parameters
    print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
    print(); print(clf.best_estimator_.get_params()['KNN'])

    # Use Cross Validation To Evaluate Model. Passing the GridSearchCV object
    # gives nested cross-validation: the grid search is re-run in each outer fold.
    CV_Result = cross_val_score(clf, X, y, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
    print(); print(CV_Result)
    print(); print(CV_Result.mean())
    print(); print(CV_Result.std())

Snippet_155()
***************## How to use nearest neighbours for Classification****************
Best Number Of Components: 17

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[0.3115727  0.35223881 0.32926829]

0.3310265996499373

0.01664835868192384
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.4s finished
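
Once the grid search has been fitted, the refitted best pipeline can be used directly for prediction. A minimal sketch, assuming clf is the fitted GridSearchCV from above (in the snippet it is local to Snippet_155, so you would need to return it or run the recipe at module level; X_new is a hypothetical batch of new samples):

# Predict with the best pipeline found by the grid search (illustrative sketch)
import numpy as np
X_new = np.random.randn(5, 20)   # hypothetical new samples with 20 features
print(clf.best_params_)          # the winning parameter combination
print(clf.predict(X_new))        # scale, project with PCA, then KNN vote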