# Importing the dependencies
import warnings
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
# Silence every warning for the remainder of the script.
warnings.filterwarnings("ignore")

# Load the iris feature matrix (X) and the class labels (Y).
X, Y = datasets.load_iris(return_X_y=True)

# Feature subsets selected by column index: X1-X6 are the six 2-column
# combinations of columns 0-3, X7-X10 are the four 3-column combinations.
# The full matrix X is evaluated alongside them as an eleventh case.
X1, X2, X3, X4, X5, X6, X7, X8, X9, X10 = (
    X[:, list(columns)]
    for columns in (
        (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3),
        (0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3),
    )
)


def _split_80_20(features):
    """Split *features* and Y into 80% training / 20% test parts.

    The fixed random_state means every feature subset is partitioned with
    the same seed.
    """
    return train_test_split(features, Y, test_size=0.2, random_state=42)


# One split per case: the un-numbered names hold the full feature matrix,
# the numbered names hold the corresponding subset X1..X10.
x_train, x_test, y_train, y_test = _split_80_20(X)
x_train1, x_test1, y_train1, y_test1 = _split_80_20(X1)
x_train2, x_test2, y_train2, y_test2 = _split_80_20(X2)
x_train3, x_test3, y_train3, y_test3 = _split_80_20(X3)
x_train4, x_test4, y_train4, y_test4 = _split_80_20(X4)
x_train5, x_test5, y_train5, y_test5 = _split_80_20(X5)
x_train6, x_test6, y_train6, y_test6 = _split_80_20(X6)
x_train7, x_test7, y_train7, y_test7 = _split_80_20(X7)
x_train8, x_test8, y_train8, y_test8 = _split_80_20(X8)
x_train9, x_test9, y_train9, y_test9 = _split_80_20(X9)
x_train10, x_test10, y_train10, y_test10 = _split_80_20(X10)
# Base estimator whose hyper-parameters are tuned by the grid search below.
logistic = LogisticRegression()

# Values shared by every sub-grid.
_GRID_C_VALUES = [10, 1, .1, .001]
_GRID_MAX_ITERS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 250, 500, 1000]

# The grid is split by solver capability so that only valid penalty/solver
# pairs are searched: 'lbfgs', 'newton-cg' and 'sag' do not support the l1
# penalty, whereas 'liblinear' and 'saga' support both l1 and l2. Crossing
# every penalty with every solver would schedule fits that can only fail.
param_grid = [
    {
        'penalty': ['l2'],
        'C': _GRID_C_VALUES,
        'solver': ['lbfgs', 'newton-cg', 'sag'],
        'max_iter': _GRID_MAX_ITERS,
    },
    {
        'penalty': ['l1', 'l2'],
        'C': _GRID_C_VALUES,
        'solver': ['liblinear', 'saga'],
        'max_iter': _GRID_MAX_ITERS,
    },
]

# Grid search using 3-fold cross validation, run on all available cores.
# NOTE(review): an earlier comment described this as 5-fold; cv=3 is what the
# code runs -- confirm which is intended.
clf = GridSearchCV(logistic, param_grid=param_grid, cv=3, verbose=True, n_jobs=-1)
# Per-case train/test partitions, indexed by case number: index 0 is the full
# feature matrix, indices 1-10 are the feature subsets X1..X10.
X_train = [x_train, x_train1, x_train2, x_train3, x_train4, x_train5,
           x_train6, x_train7, x_train8, x_train9, x_train10]
Y_train = [y_train, y_train1, y_train2, y_train3, y_train4, y_train5,
           y_train6, y_train7, y_train8, y_train9, y_train10]
X_test = [x_test, x_test1, x_test2, x_test3, x_test4, x_test5,
          x_test6, x_test7, x_test8, x_test9, x_test10]
Y_test = [y_test, y_test1, y_test2, y_test3, y_test4, y_test5,
          y_test6, y_test7, y_test8, y_test9, y_test10]

# One [accuracy, best estimator] pair per case.
lst = []
for train_features, train_labels in zip(X_train, Y_train):
    # StandardScaler.transform returns a new array instead of scaling in
    # place, so the result has to be captured for the scaling to take effect.
    # The entries of X_train are left untouched for later sections.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_scaled = scaler.transform(train_features)

    # Grid search for the best hyper-parameters on this case's scaled data.
    best_clf = clf.fit(train_scaled, train_labels)

    # NOTE(review): this is the accuracy on the training split; the held-out
    # X_test/Y_test partitions are not scored in this section.
    lst.append([
        best_clf.score(train_scaled, train_labels),
        best_clf.best_estimator_,
    ])

# Report the stored accuracy and the selected estimator for every case.
for case_number, (case_accuracy, case_estimator) in enumerate(lst):
    print("for case: ", case_number)
    print(case_accuracy)
    print(case_estimator)
    print("\n")
# Inverse regularization strengths to compare.
C = [10, 1, .1, .001]

# lstl1[i][k] / lstl2[i][k]: test accuracy for case i with the l1 / l2
# penalty at C[k].
lstl1 = []
lstl2 = []
for train_features, train_labels, test_features, test_labels in zip(
        X_train, Y_train, X_test, Y_test):
    # Fit the scaler on the training split only, then keep the transformed
    # copies: transform() returns new arrays rather than scaling in place.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_scaled = scaler.transform(train_features)
    test_scaled = scaler.transform(test_features)

    lst_l1 = []
    lst_l2 = []
    for c in C:
        # l1-regularized model at this C value.
        model_l1 = LogisticRegression(penalty='l1', C=c, solver='liblinear')
        model_l1.fit(train_scaled, train_labels)
        lst_l1.append(model_l1.score(test_scaled, test_labels))

        # l2-regularized model (the estimator's default penalty) at this C.
        model_l2 = LogisticRegression(C=c, solver='liblinear')
        model_l2.fit(train_scaled, train_labels)
        lst_l2.append(model_l2.score(test_scaled, test_labels))

    # Append once per case, after the sweep over C, so that lstl1[i] and
    # lstl2[i] line up with case i when the results are reported below.
    lstl1.append(lst_l1)
    lstl2.append(lst_l2)

# Print, per case, the l1 accuracies followed by the l2 accuracies, each in
# the order of the values in C.
for case_number, (l1_scores, l2_scores) in enumerate(zip(lstl1, lstl2)):
    print("for case: ", case_number)
    print("\n")
    for accuracy in l1_scores:
        print(accuracy)
    print("\n")
    for accuracy in l2_scores:
        print(accuracy)
    print("\n")