In [1]:
#!pip install zipcodes
In [2]:
#!pip install basemap
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipcodes as zcode
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
from warnings import filterwarnings
filterwarnings("ignore")
In [4]:
df = pd.read_csv('Bank_Personal_Loan_Modelling.csv')   # read_csv already returns a DataFrame
df
Out[4]:
ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1/60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1/50 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1/00 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2/70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1/00 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1/90 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0/40 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0/30 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0/50 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0/80 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 14 columns
In [5]:
df.describe(include='all')
Out[5]:
ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.000000 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | 108 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
top | NaN | NaN | NaN | NaN | NaN | NaN | 0/30 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
freq | NaN | NaN | NaN | NaN | NaN | NaN | 241 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mean | 2500.500000 | 45.338400 | 20.104600 | 73.774200 | 93152.503000 | 2.396400 | NaN | 1.881000 | 56.498800 | 0.096000 | 0.104400 | 0.06040 | 0.596800 | 0.294000 |
std | 1443.520003 | 11.463166 | 11.467954 | 46.033729 | 2121.852197 | 1.147663 | NaN | 0.839869 | 101.713802 | 0.294621 | 0.305809 | 0.23825 | 0.490589 | 0.455637 |
min | 1.000000 | 23.000000 | -3.000000 | 8.000000 | 9307.000000 | 1.000000 | NaN | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
25% | 1250.750000 | 35.000000 | 10.000000 | 39.000000 | 91911.000000 | 1.000000 | NaN | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
50% | 2500.500000 | 45.000000 | 20.000000 | 64.000000 | 93437.000000 | 2.000000 | NaN | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 0.000000 |
75% | 3750.250000 | 55.000000 | 30.000000 | 98.000000 | 94608.000000 | 3.000000 | NaN | 3.000000 | 101.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 1.000000 |
max | 5000.000000 | 67.000000 | 43.000000 | 224.000000 | 96651.000000 | 4.000000 | NaN | 3.000000 | 635.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  5000 non-null   int64 
 1   Age                 5000 non-null   int64 
 2   Experience          5000 non-null   int64 
 3   Income              5000 non-null   int64 
 4   ZIP Code            5000 non-null   int64 
 5   Family              5000 non-null   int64 
 6   CCAvg               5000 non-null   object
 7   Education           5000 non-null   int64 
 8   Mortgage            5000 non-null   int64 
 9   Personal Loan       5000 non-null   int64 
 10  Securities Account  5000 non-null   int64 
 11  CD Account          5000 non-null   int64 
 12  Online              5000 non-null   int64 
 13  CreditCard          5000 non-null   int64 
dtypes: int64(13), object(1)
memory usage: 547.0+ KB
In [7]:
# CCAvg was parsed as strings like "1/60"; substitute "/" with "." and cast to float.
df["CCAvg"] = df["CCAvg"].replace("/", ".", regex=True).astype("float")
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
In [9]:
# Experience contains negative entries (describe() shows min = -3), presumably data-entry
# errors; take absolute values to remove them.
df['Experience'] = abs(df['Experience'])
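A quick sanity check that the fix took (the describe() above reported min = -3):
In [ ]:
(df['Experience'] < 0).sum()   # expected to be 0 after taking absolute values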
In [10]:
df.describe(include="all")
Out[10]:
ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.000000 |
mean | 2500.500000 | 45.338400 | 20.134600 | 73.774200 | 93152.503000 | 2.396400 | 1.937938 | 1.881000 | 56.498800 | 0.096000 | 0.104400 | 0.06040 | 0.596800 | 0.294000 |
std | 1443.520003 | 11.463166 | 11.415189 | 46.033729 | 2121.852197 | 1.147663 | 1.747659 | 0.839869 | 101.713802 | 0.294621 | 0.305809 | 0.23825 | 0.490589 | 0.455637 |
min | 1.000000 | 23.000000 | 0.000000 | 8.000000 | 9307.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
25% | 1250.750000 | 35.000000 | 10.000000 | 39.000000 | 91911.000000 | 1.000000 | 0.700000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
50% | 2500.500000 | 45.000000 | 20.000000 | 64.000000 | 93437.000000 | 2.000000 | 1.500000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 0.000000 |
75% | 3750.250000 | 55.000000 | 30.000000 | 98.000000 | 94608.000000 | 3.000000 | 2.500000 | 3.000000 | 101.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 1.000000 |
max | 5000.000000 | 67.000000 | 43.000000 | 224.000000 | 96651.000000 | 4.000000 | 10.000000 | 3.000000 | 635.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
In [11]:
# Drop the uninformative ID column and move the target column to the last position.
df_raw = df.drop(columns=["Personal Loan","ID"])
df_raw["Personal Loan"] = df["Personal Loan"]
df_raw.head()
Out[11]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
In [12]:
# CCAvg is average monthly credit-card spending; multiply by 12 to annualise it.
df = df_raw   # continue with the rearranged frame
df['CCAvg'] = df['CCAvg'] * 12
df['CCAvg']
Out[12]:
0       19.2
1       18.0
2       12.0
3       32.4
4       12.0
        ... 
4995    22.8
4996     4.8
4997     3.6
4998     6.0
4999     9.6
Name: CCAvg, Length: 5000, dtype: float64
In [13]:
df['ZIP Code'].nunique()
Out[13]:
467
Check missing values¶
In [15]:
df.isnull().sum()
Out[15]:
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
Personal Loan         0
dtype: int64
There are no missing values.
Noise¶
Categorical
In [19]:
Categorical = ['Family', 'Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
# Count plots of the categorical columns, three per row.
for start in range(0, len(Categorical), 3):
    fig = plt.figure(figsize = [20, 4])
    for j, col in enumerate(Categorical[start:start + 3]):
        plt.subplot(1, 3, j + 1)
        sns.countplot(x = col, data = df)
    plt.show()
Numerical
In [21]:
Numerical = ['Age', 'Experience', 'CCAvg', 'Mortgage', 'Income']
# Boxplots of the numerical columns, two per row.
for start in range(0, len(Numerical), 2):
    fig = plt.figure(figsize = [20, 4])
    for j, col in enumerate(Numerical[start:start + 2]):
        plt.subplot(1, 2, j + 1)
        sns.boxplot(x = col, data = df)
    plt.show()
The numerical ranges look plausible.
Zip Codes
In [24]:
plt.scatter(df['ZIP Code'], df['Personal Loan'])
plt.annotate('maybe noise', xy = (9307, 0), xytext = (13000, 0.25), arrowprops = dict(facecolor = 'black', shrink = 0.05))
plt.show()
In [25]:
Noise_Zip = df[df['ZIP Code']<20000]
Noise_Zip
Out[25]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
384 | 51 | 25 | 21 | 9307 | 4 | 7.2 | 3 | 0 | 0 | 0 | 1 | 1 | 0 |
In [26]:
df.drop(index = 384, inplace = True)   # 9307 is not a valid 5-digit California ZIP; drop the row
In [27]:
list_zipcode = list(df['ZIP Code'])
In [28]:
county = []
lat = []
long = []
# Look up county and coordinates for each ZIP with the zipcodes package.
for x in list_zipcode:
    match = zcode.matching(str(x))
    if len(match) == 1:
        county.append(match[0].get('county'))
        lat.append(match[0].get('lat'))
        long.append(match[0].get('long'))
    else:
        county.append('Notfound')
        lat.append(np.nan)
        long.append(np.nan)
df['Place'] = county
df['Latitude'] = lat
df['Longitude'] = long
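Only 467 of the 4,999 ZIP values are distinct, so an equivalent sketch queries each value once and broadcasts the result back with map(); the name `lookup` is introduced here for illustration:
In [ ]:
# Query each distinct ZIP once instead of once per row.
lookup = {}
for z in df['ZIP Code'].unique():
    m = zcode.matching(str(z))
    if len(m) == 1:
        lookup[z] = (m[0]['county'], m[0]['lat'], m[0]['long'])
    else:
        lookup[z] = ('Notfound', np.nan, np.nan)
triples = df['ZIP Code'].map(lookup)
df['Place'] = triples.str[0]
df['Latitude'] = triples.str[1]
df['Longitude'] = triples.str[2]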
In [29]:
print(zcode.matching(str(92612)))
df
[{'zip_code': '92612', 'zip_code_type': 'STANDARD', 'active': True, 'city': 'Irvine', 'acceptable_cities': [], 'unacceptable_cities': [], 'state': 'CA', 'county': 'Orange County', 'timezone': 'America/Los_Angeles', 'area_codes': ['714', '949'], 'world_region': 'NA', 'country': 'US', 'lat': '33.6615', 'long': '-117.8217'}]
Out[29]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | Place | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 1 | 49 | 91107 | 4 | 19.2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Los Angeles County | 34.1620 | -118.0894 |
1 | 45 | 19 | 34 | 90089 | 3 | 18.0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Los Angeles County | 34.0218 | -118.2883 |
2 | 39 | 15 | 11 | 94720 | 1 | 12.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Alameda County | 37.8746 | -122.2547 |
3 | 35 | 9 | 100 | 94112 | 1 | 32.4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | San Francisco County | 37.7217 | -122.4446 |
4 | 35 | 8 | 45 | 91330 | 4 | 12.0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Los Angeles County | 34.2429 | -118.5273 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4995 | 29 | 3 | 40 | 92697 | 1 | 22.8 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | Orange County | 33.6473 | -117.8409 |
4996 | 30 | 4 | 15 | 92037 | 4 | 4.8 | 1 | 85 | 0 | 0 | 1 | 0 | 0 | San Diego County | 32.8668 | -117.2482 |
4997 | 63 | 39 | 24 | 93023 | 2 | 3.6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | Ventura County | 34.5210 | -119.2477 |
4998 | 65 | 40 | 49 | 90034 | 3 | 6.0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | Los Angeles County | 34.0293 | -118.3994 |
4999 | 28 | 4 | 83 | 92612 | 3 | 9.6 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | Orange County | 33.6615 | -117.8217 |
4999 rows × 16 columns
92612 is Irvine, California; the Place/Latitude/Longitude columns were added as expected.
In [31]:
df = df.astype({'Latitude':float, 'Longitude':float})
In [32]:
df.describe(include='all')
Out[32]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | Place | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999.000000 | 4999 | 4966.000000 | 4966.000000 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 39 | NaN | NaN |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Los Angeles County | NaN | NaN |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1095 | NaN | NaN |
mean | 45.337267 | 20.133627 | 73.784757 | 93169.275455 | 2.396079 | 23.258468 | 1.880776 | 56.510102 | 0.104421 | 0.060412 | 0.596719 | 0.293859 | 0.096019 | NaN | 35.792110 | -120.037847 |
std | 11.464033 | 11.416124 | 46.032281 | 1759.630610 | 1.147554 | 20.972776 | 0.839804 | 101.720837 | 0.305836 | 0.238273 | 0.490605 | 0.455574 | 0.294647 | NaN | 2.098821 | 2.092821 |
min | 23.000000 | 0.000000 | 8.000000 | 90005.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 32.554700 | -124.099400 |
25% | 35.000000 | 10.000000 | 39.000000 | 91911.000000 | 1.000000 | 8.400000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 34.003300 | -122.148500 |
50% | 45.000000 | 20.000000 | 64.000000 | 93437.000000 | 2.000000 | 18.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 35.373600 | -120.042400 |
75% | 55.000000 | 30.000000 | 98.000000 | 94608.000000 | 3.000000 | 30.000000 | 3.000000 | 101.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | NaN | 37.748700 | -118.089400 |
max | 67.000000 | 43.000000 | 224.000000 | 96651.000000 | 4.000000 | 120.000000 | 3.000000 | 635.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 41.758900 | -115.630500 |
In [34]:
# Note the sentinel written by the lookup loop is 'Notfound', not 'NotFound'.
df[df['Place'] == 'Notfound']['ZIP Code'].value_counts()
Out[34]:
ZIP Code
92717    22
96651     6
92634     5
Name: count, dtype: int64
In [35]:
df['Place'].unique()
Out[35]:
array(['Los Angeles County', 'Alameda County', 'San Francisco County', 'San Diego County', 'Monterey County', 'Ventura County', 'Santa Barbara County', 'Marin County', 'Santa Clara County', 'Santa Cruz County', 'San Mateo County', 'Humboldt County', 'Contra Costa County', 'Orange County', 'Sacramento County', 'Yolo County', 'Placer County', 'San Bernardino County', 'San Luis Obispo County', 'Riverside County', 'Kern County', 'Notfound', 'Fresno County', 'Sonoma County', 'El Dorado County', 'San Benito County', 'Butte County', 'Solano County', 'Mendocino County', 'San Joaquin County', 'Imperial County', 'Siskiyou County', 'Merced County', 'Trinity County', 'Stanislaus County', 'Shasta County', 'Tuolumne County', 'Napa County', 'Lake County'], dtype=object)
In [36]:
def highlight_cols(s):
    # Paint every cell in the selected column(s) red.
    return 'background-color: red'
In [38]:
# ZIP 92717 (the old UC Irvine code) is retired, so fill in its location manually.
# Use df.loc with a row mask and column list; chained assignment like
# df['Place'].loc[x] = ... can fail silently on a copy.
df.loc[df['ZIP Code'] == 92717, ['Place', 'Latitude', 'Longitude']] = ['Irvine', 33.6462, -117.8398]
df[df['ZIP Code'] == 92717].style.applymap(highlight_cols, subset = pd.IndexSlice[:, ['Personal Loan']])
Out[38]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | Place | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
106 | 43 | 17 | 69 | 92717 | 4 | 34.800000 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
172 | 38 | 13 | 171 | 92717 | 2 | 93.600000 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
184 | 52 | 26 | 63 | 92717 | 2 | 18.000000 | 2 | 0 | 1 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
321 | 44 | 20 | 101 | 92717 | 3 | 52.800000 | 2 | 82 | 0 | 0 | 0 | 0 | 1 | Irvine | 33.646200 | -117.839800 |
366 | 50 | 24 | 35 | 92717 | 1 | 3.600000 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
476 | 60 | 34 | 53 | 92717 | 1 | 9.600000 | 2 | 0 | 1 | 0 | 0 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
695 | 29 | 4 | 115 | 92717 | 1 | 22.800000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
721 | 49 | 24 | 39 | 92717 | 1 | 16.800000 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
1099 | 30 | 6 | 52 | 92717 | 3 | 8.400000 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
1189 | 42 | 17 | 115 | 92717 | 2 | 4.800000 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
1483 | 58 | 32 | 63 | 92717 | 1 | 19.200000 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
1752 | 33 | 8 | 155 | 92717 | 1 | 88.800000 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | Irvine | 33.646200 | -117.839800 |
1844 | 65 | 40 | 21 | 92717 | 3 | 1.200000 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
2049 | 43 | 18 | 94 | 92717 | 4 | 13.200000 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
2211 | 39 | 14 | 31 | 92717 | 2 | 16.800000 | 2 | 94 | 0 | 0 | 1 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
2428 | 39 | 12 | 108 | 92717 | 4 | 44.040000 | 2 | 301 | 0 | 0 | 0 | 1 | 1 | Irvine | 33.646200 | -117.839800 |
2486 | 61 | 36 | 130 | 92717 | 1 | 15.600000 | 1 | 257 | 0 | 0 | 0 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
2957 | 61 | 36 | 53 | 92717 | 3 | 6.000000 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
4090 | 42 | 18 | 49 | 92717 | 3 | 25.200000 | 3 | 0 | 1 | 0 | 1 | 0 | 0 | Irvine | 33.646200 | -117.839800 |
4276 | 50 | 24 | 155 | 92717 | 1 | 87.600000 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
4321 | 27 | 0 | 34 | 92717 | 1 | 24.000000 | 2 | 112 | 0 | 0 | 0 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
4384 | 45 | 20 | 61 | 92717 | 3 | 32.400000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | Irvine | 33.646200 | -117.839800 |
In [39]:
# 96651 matches no US ZIP; map it to Rudno nad Hronom (a Slovak village with postal code 966 51).
df.loc[df['ZIP Code'] == 96651, ['Place', 'Latitude', 'Longitude']] = ['Rudno nad Hronom', 48.4242, 18.7071]
df[df['ZIP Code'] == 96651].style.applymap(highlight_cols, subset = pd.IndexSlice[:, ['Personal Loan']])
Out[39]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | Place | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
630 | 32 | 7 | 35 | 96651 | 3 | 15.600000 | 1 | 108 | 0 | 0 | 0 | 1 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
672 | 51 | 27 | 23 | 96651 | 1 | 2.400000 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
1426 | 37 | 11 | 60 | 96651 | 3 | 6.000000 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
1653 | 26 | 1 | 24 | 96651 | 2 | 10.800000 | 3 | 123 | 0 | 0 | 0 | 1 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
2731 | 29 | 5 | 28 | 96651 | 1 | 2.400000 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
3525 | 59 | 34 | 13 | 96651 | 4 | 10.800000 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Rudno nad Hronom | 48.424200 | 18.707100 |
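The two patches above repeat the same pattern; a small helper (patch_zip is a hypothetical name) keeps them in one place:
In [ ]:
def patch_zip(frame, zip_code, place, lat, long):
    # Overwrite the location columns for every row carrying a retired ZIP.
    frame.loc[frame['ZIP Code'] == zip_code, ['Place', 'Latitude', 'Longitude']] = [place, lat, long]

patch_zip(df, 92717, 'Irvine', 33.6462, -117.8398)
patch_zip(df, 96651, 'Rudno nad Hronom', 48.4242, 18.7071)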
In [40]:
df_for_mapping = df[df['Place']!= 'Rudno nad Hronom']
In [41]:
#img = plt.imread('map.jpg')
target_0 = df_for_mapping[df_for_mapping['Personal Loan']==0]
target_1 = df_for_mapping[df_for_mapping['Personal Loan']==1]
fig, axs = plt.subplots(figsize = (15, 10))
#axs.imshow(img, extent = [-126, -114, 32, 43], alpha = 0.7)
axs.scatter(target_0['Longitude'], target_0['Latitude'], c= 'cyan', alpha = 0.5, label = 'Loan 0')
axs.scatter(target_1['Longitude'], target_1['Latitude'], c= 'r', alpha = 0.5, label = 'Loan 1')
plt.title('Longitude & Latitude', fontsize = 30, backgroundcolor = 'black', c='white')
plt.xlabel('Longitude', fontsize = 20)
plt.ylabel('Latitude', fontsize = 20)
plt.grid()
plt.legend(loc = 'best')
Out[41]:
<matplotlib.legend.Legend at 0x7f88ecc24a90>
In [42]:
plt.figure(figsize = (15, 10))
sns.countplot(x = 'Place', data = df)
plt.title('Location', fontsize = 30, backgroundcolor = 'black', c = 'white')
plt.grid()
plt.xticks(rotation = 90, fontsize = 15)
plt.xlabel('place', fontsize = 20)
plt.ylabel('count', fontsize = 20)
plt.show()
In [43]:
plt.figure(figsize = (15, 10))
sns.countplot(y = 'Place', data = df)
plt.title('Location', fontsize = 30, backgroundcolor = 'black', c = 'white')
plt.grid()
plt.xticks(fontsize = 15)
plt.xlabel('count', fontsize = 20)
plt.ylabel('place', fontsize = 20)
plt.show()
In [44]:
fig, ax = plt.subplots(figsize = (15, 10))
sns.heatmap(df.corr(numeric_only=True), cmap = 'RdBu_r', cbar = True, annot = True, linewidths = 0.5, ax = ax)
plt.show()
In [45]:
df.corr(numeric_only=True)
Out[45]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Personal Loan | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Age | 1.000000 | 0.993991 | -0.055164 | -0.030526 | -0.046566 | -0.051941 | 0.041211 | -0.012484 | -0.000403 | 0.008068 | 0.013622 | 0.007530 | -0.007694 | -0.027757 | -0.009929 |
Experience | 0.993991 | 1.000000 | -0.046785 | -0.030833 | -0.051981 | -0.049676 | 0.013808 | -0.011050 | -0.000960 | 0.009757 | 0.013982 | 0.008721 | -0.008276 | -0.028234 | -0.008615 |
Income | -0.055164 | -0.046785 | 1.000000 | -0.030725 | -0.157232 | 0.645931 | -0.187277 | 0.206712 | -0.002695 | 0.169703 | 0.014397 | -0.002030 | 0.502459 | -0.031680 | -0.023029 |
ZIP Code | -0.030526 | -0.030833 | -0.030725 | 1.000000 | 0.027532 | -0.012197 | -0.008254 | 0.003608 | 0.002418 | 0.021669 | 0.028328 | 0.024055 | -0.002977 | 0.856855 | -0.261323 |
Family | -0.046566 | -0.051981 | -0.157232 | 0.027532 | 1.000000 | -0.109088 | 0.064581 | -0.020294 | 0.020094 | 0.014184 | 0.010127 | 0.011160 | 0.061471 | 0.026448 | -0.012197 |
CCAvg | -0.051941 | -0.049676 | 0.645931 | -0.012197 | -0.109088 | 1.000000 | -0.135952 | 0.109829 | 0.015035 | 0.136504 | -0.003486 | -0.006454 | 0.366864 | -0.019101 | -0.020693 |
Education | 0.041211 | 0.013808 | -0.187277 | -0.008254 | 0.064581 | -0.135952 | 1.000000 | -0.033186 | -0.010723 | 0.014004 | -0.015227 | -0.011432 | 0.136834 | -0.013418 | 0.015991 |
Mortgage | -0.012484 | -0.011050 | 0.206712 | 0.003608 | -0.020294 | 0.109829 | -0.033186 | 1.000000 | -0.005449 | 0.089286 | -0.005904 | -0.007061 | 0.142065 | -0.003181 | -0.004485 |
Securities Account | -0.000403 | -0.000960 | -0.002695 | 0.002418 | 0.020094 | 0.015035 | -0.010723 | -0.005449 | 1.000000 | 0.317023 | 0.012685 | -0.014926 | 0.021932 | -0.004530 | -0.012094 |
CD Account | 0.008068 | 0.009757 | 0.169703 | 0.021669 | 0.014184 | 0.136504 | 0.014004 | 0.089286 | 0.317023 | 1.000000 | 0.175935 | 0.278792 | 0.316344 | 0.023647 | -0.020580 |
Online | 0.013622 | 0.013982 | 0.014397 | 0.028328 | 0.010127 | -0.003486 | -0.015227 | -0.005904 | 0.012685 | 0.175935 | 1.000000 | 0.003956 | 0.006332 | 0.028896 | -0.014014 |
CreditCard | 0.007530 | 0.008721 | -0.002030 | 0.024055 | 0.011160 | -0.006454 | -0.011432 | -0.007061 | -0.014926 | 0.278792 | 0.003956 | 1.000000 | 0.002903 | 0.015377 | -0.005542 |
Personal Loan | -0.007694 | -0.008276 | 0.502459 | -0.002977 | 0.061471 | 0.366864 | 0.136834 | 0.142065 | 0.021932 | 0.316344 | 0.006332 | 0.002903 | 1.000000 | -0.008494 | -0.010898 |
Latitude | -0.027757 | -0.028234 | -0.031680 | 0.856855 | 0.026448 | -0.019101 | -0.013418 | -0.003181 | -0.004530 | 0.023647 | 0.028896 | 0.015377 | -0.008494 | 1.000000 | -0.183501 |
Longitude | -0.009929 | -0.008615 | -0.023029 | -0.261323 | -0.012197 | -0.020693 | 0.015991 | -0.004485 | -0.012094 | -0.020580 | -0.014014 | -0.005542 | -0.010898 | -0.183501 | 1.000000 |
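Income (0.50), CCAvg (0.37), and CD Account (0.32) correlate most strongly with the target; a one-liner ranks them directly:
In [ ]:
df.corr(numeric_only=True)['Personal Loan'].drop('Personal Loan').sort_values(ascending=False)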
In [46]:
plt.figure(figsize = (15, 10))
# 'shade' is deprecated in seaborn and redundant with fill=True.
sns.kdeplot(data = df, x = 'Income', y = 'Age', hue = 'Personal Loan', fill = True, common_norm = False,
            palette = 'crest', alpha = 0.5, linewidth = 0)
plt.title('Income-Age', fontsize = 20)
plt.xlabel('Income', fontsize = 20)
plt.ylabel ('Age', fontsize = 20)
plt.xticks(fontsize = 20)
plt.yticks(fontsize=20)
plt.show()
In [47]:
plt.figure(figsize = (15, 10))
sns.kdeplot(data = df, x = 'Age', y = 'CCAvg', hue = 'Personal Loan', fill = True, common_norm = False,
            palette = 'crest', alpha = 0.5, linewidth = 0)
plt.title('Age-CCAvg', fontsize = 20)
plt.xlabel('Age', fontsize = 20)
plt.ylabel ('CCAvg', fontsize = 20)
plt.xticks(fontsize = 20)
plt.yticks(fontsize=20)
plt.show()
In [100]:
X[X['Latitude'].isnull()]   # rows with missing coordinates; all share the retired ZIP 92634
Out[100]:
Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Securities Account | CD Account | Online | CreditCard | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
468 | 34 | 10 | 21 | 92634 | 1 | 6.0 | 3 | 0 | 0 | 0 | 1 | 0 | NaN | NaN |
780 | 32 | 7 | 42 | 92634 | 4 | 9.6 | 1 | 0 | 0 | 0 | 1 | 1 | NaN | NaN |
2218 | 38 | 13 | 9 | 92634 | 2 | 3.6 | 2 | 0 | 0 | 0 | 0 | 0 | NaN | NaN |
3887 | 24 | 2 | 118 | 92634 | 2 | 86.4 | 1 | 0 | 1 | 0 | 1 | 0 | NaN | NaN |
4392 | 52 | 27 | 81 | 92634 | 4 | 45.6 | 2 | 0 | 0 | 0 | 0 | 0 | NaN | NaN |
In [134]:
df2 = df.dropna()
df2.reset_index(drop = True, inplace = True)
X = df2.drop(columns = ['Personal Loan', 'Place'], axis = 1)
y = df2['Personal Loan'].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
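Given the roughly 9 : 1 class imbalance shown at the end of this notebook, a stratified split (a sketch, not used in the runs below) keeps the class ratio stable across train and test:
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 0, stratify = y.ravel())   # preserve the 0/1 ratio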
LogReg¶
In [148]:
logreg = LogisticRegression(solver = 'liblinear', C=50)
logreg.fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)
print('f1 score is: ', metrics.f1_score(y_test, y_pred))
f1 score is: 0.37762237762237766
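classification_report (imported at the top but never called) breaks this down into per-class precision and recall; a sketch:
In [ ]:
print(classification_report(y_test, y_pred, digits = 3))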
In [150]:
def Replace(dataframe):
    # Shift the category codes. Copy the argument that was passed in; the original
    # line shadowed it with df2.copy(), so the function always transformed df2.
    dataframe = dataframe.copy()
    dataframe['Education'].replace([1, 2, 3], [4, 5, 6], inplace = True)
    dataframe['Securities Account'].replace([1, 2], [3, 4], inplace = True)
    dataframe['CD Account'].replace([1, 2], [3, 4], inplace = True)
    dataframe['Online'].replace([1, 2], [3, 4], inplace = True)
    return dataframe
In [285]:
def Logreg(X, y, Testsize, solver = 'liblinear'):
    # Fit a balanced logistic regression for each candidate test size and
    # collect the test accuracy plus the score on the full data.
    df_evaluation = pd.DataFrame()
    for x in Testsize:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = x, random_state = 0)
        logreg = LogisticRegression(solver = solver, class_weight = 'balanced')
        logreg.fit(X_train, y_train)
        y_pred = logreg.predict(X_test)
        row = {'Test_size': x, 'acc': metrics.accuracy_score(y_test, y_pred), 'score': logreg.score(X, y)}
        df_evaluation = pd.concat([df_evaluation, pd.DataFrame([row])], ignore_index = True)
    return (X_train, X_test, y_train, y_test, y_pred, df_evaluation)

def highlight_max(s):
    # Highlight the column-wise maximum in yellow.
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]
In [287]:
df_r = Replace(df2)
X = df_r.drop(columns=['Personal Loan', 'Place'])
y = df_r['Personal Loan']
In [289]:
X_train, X_test, y_train, y_test, y_pred, df_evaluation= Logreg(X, y, [.1, .15, .2, .25, .3])
df_evaluation.style.apply(highlight_max)
Out[289]:
Test_size | acc | score | |
---|---|---|---|
0 | 0.100000 | 0.892000 | 0.893472 |
1 | 0.150000 | 0.893333 | 0.894473 |
2 | 0.200000 | 0.894895 | 0.892070 |
3 | 0.250000 | 0.884708 | 0.889668 |
4 | 0.300000 | 0.887258 | 0.890669 |
In [214]:
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, label = "data1")
plt.legend(loc = 4)
plt.show()
In [220]:
y_pred_proba = logreg.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label = "data1")
plt.legend(loc = 'best')
plt.show()
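The curve can be summarised by its area; a sketch using the probabilities just computed:
In [ ]:
print('AUC:', metrics.roc_auc_score(y_test, y_pred_proba))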
In [226]:
confusion_matrix(y_test, y_pred)
Out[226]:
array([[1210, 156], [ 13, 120]])
In [228]:
solvers = ['newton-cg', 'liblinear', 'lbfgs', 'sag', 'saga']
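Since Logreg above takes a solver argument, the per-solver sections that follow can also be sketched as a single loop over this list:
In [ ]:
for s in solvers:
    *_, evaluation = Logreg(X, y, [.1, .15, .2, .25, .3], solver = s)
    print(s, evaluation['acc'].round(4).tolist())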
newton-cg¶
In [303]:
X_train, X_test, y_train, y_test, y_pred, df_evaluation = Logreg(X, y, [.1, .15, .2, .25, .3], solver = 'newton-cg')
df_evaluation.style.apply(highlight_max)
Out[303]:
Test_size | acc | score | |
---|---|---|---|
0 | 0.100000 | 0.892000 | 0.895675 |
1 | 0.150000 | 0.897333 | 0.896476 |
2 | 0.200000 | 0.896897 | 0.895675 |
3 | 0.250000 | 0.886309 | 0.891670 |
4 | 0.300000 | 0.891261 | 0.893272 |
lbfgs¶
In [307]:
X_train, X_test, y_train, y_test, y_pred, df_evaluation = Logreg(X, y, [.1, .15, .2, .25, .3], solver = 'lbfgs')
df_evaluation.style.apply(highlight_max)
Out[307]:
Test_size | acc | score | |
---|---|---|---|
0 | 0.100000 | 0.850000 | 0.856628 |
1 | 0.150000 | 0.890667 | 0.890268 |
2 | 0.200000 | 0.879880 | 0.879856 |
3 | 0.250000 | 0.881505 | 0.887665 |
4 | 0.300000 | 0.859239 | 0.859632 |
sag¶
In [315]:
X_train, X_test, y_train, y_test, y_pred, df_evaluation = Logreg(X, y, [.1, .15, .2, .25, .3], solver = 'sag')
df_evaluation.style.apply(highlight_max)
Out[315]:
Test_size | acc | score |
---|---|---|---|
0 | 0.100000 | 0.912000 | 0.903885 |
1 | 0.150000 | 0.421333 | 0.417901 |
2 | 0.200000 | 0.907908 | 0.903484 |
3 | 0.250000 | 0.800641 | 0.810773 |
4 | 0.300000 | 0.911274 | 0.903885 |
saga¶
In [319]:
X_train, X_test, y_train, y_test, y_pred, df_evaluation = Logreg(X, y, [.1, .15, .2, .25, .3], solver = 'saga')
df_evaluation.style.apply(highlight_max)
Out[319]:
Test_size | acc | score |
---|---|---|---|
0 | 0.100000 | 0.742000 | 0.751902 |
1 | 0.150000 | 0.864000 | 0.853624 |
2 | 0.200000 | 0.832833 | 0.828594 |
3 | 0.250000 | 0.909528 | 0.904686 |
4 | 0.300000 | 0.903269 | 0.893072 |
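The erratic sag and saga numbers are expected: both solvers converge reliably only on roughly same-scale features, and these columns range from 0/1 flags to five-digit ZIP codes. A sketch with standardisation (the pipeline name is illustrative):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sag_scaled = make_pipeline(StandardScaler(),
                           LogisticRegression(solver = 'sag', class_weight = 'balanced'))
sag_scaled.fit(X_train, y_train)
print('test acc:', sag_scaled.score(X_test, y_test))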
GridSearch¶
In [322]:
from sklearn.model_selection import KFold
logreg2 = LogisticRegression(solver = 'newton-cg', max_iter = 5000, n_jobs = -1)
params = {'penalty': ['l2', 'none'], 'C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200],
          'class_weight': [None, 'balanced'], 'warm_start': [False, True]}
kfold = KFold(n_splits = 5, shuffle =True, random_state = 1)
gridsearch = GridSearchCV(logreg2, param_grid=params, scoring = 'f1', n_jobs = -1, cv =kfold)
In [ ]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore')
gridsearch.fit(X_train, y_train)
In [341]:
print(gridsearch.best_params_)
print(gridsearch.best_score_)
{'C': 0.1, 'class_weight': None, 'penalty': 'none', 'warm_start': False}
0.7184085514452588
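The runner-up configurations are worth a glance as well; a sketch over cv_results_:
In [ ]:
pd.DataFrame(gridsearch.cv_results_).nlargest(3, 'mean_test_score')[['params', 'mean_test_score']]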
KNN¶
In [397]:
training_acc = []
test_acc = []
neighbors_setting = range(1, 30)
for n_neighbors in neighbors_setting:
knn_model = KNeighborsClassifier(n_neighbors = n_neighbors)
knn_model.fit(X_train, y_train.ravel())
training_acc.append(knn_model.score(X_train, y_train))
test_acc.append(knn_model.score(X_test, y_test))
plt.figure(figsize = (15, 10))
plt.plot(neighbors_setting, training_acc, label = "Accuracy of the training set")
plt.plot(neighbors_setting, test_acc, label = "Accuracy of the test set")
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors')
plt.grid()
plt.legend()
Out[397]:
<matplotlib.legend.Legend at 0x7f88d6b1d310>
In [399]:
print(np.max(training_acc))
print(np.max(test_acc))
1.0
0.914609739826551
In [401]:
knn_model = KNeighborsClassifier(9)
knn_model.fit(X_train, y_train.ravel())
y_pred = knn_model.predict(X_test)
print("Accuracy for knn: ", metrics.accuracy_score(y_test, y_pred))
Accuracy for knn: 0.9092728485657104
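KNN is distance-based, so unscaled columns (ZIP Code near 93,000 next to 0/1 flags) dominate the metric; standardising first usually changes the picture. A sketch, reusing k = 9:
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 9))
knn_scaled.fit(X_train, y_train.ravel())
print('Accuracy (scaled):', metrics.accuracy_score(y_test, knn_scaled.predict(X_test)))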
In [403]:
confusion_matrix(y_test, y_pred)
Out[403]:
array([[1338, 28], [ 108, 25]])
Compare with the logistic-regression confusion matrix above (Out[226]): KNN recovers far fewer of the 133 positive cases (25 vs. 120).
In [406]:
from sklearn.model_selection import GridSearchCV
parameters = {"n_neighbors": range(1, 30) }
gridsearch2 = GridSearchCV(estimator = knn_model,
param_grid = parameters,
scoring = 'f1',
cv = 5,
verbose = 1,
n_jobs = -1)
gridsearch2.fit(X_train, y_train.ravel())
gridsearch2.best_params_
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Out[406]:
{'n_neighbors': 1}
In [407]:
print(gridsearch2.best_params_)
print(gridsearch2.best_score_)
{'n_neighbors': 1}
0.3653787148242307
Grid search for KNN picks k = 1, but its cross-validated f1 (0.365) is poor.
Naive Bayes¶
Complement
In [411]:
x = df.drop(columns=['Personal Loan', 'Place', 'Longitude', 'Latitude'])
y=df['Personal Loan']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
clf = ComplementNB()
clf.fit(X_train, y_train.ravel())
y_pred = clf.predict(X_test)
print('Accuracy is', metrics.accuracy_score(y_test, y_pred))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[411], line 3
      1 x = df.drop(columns=['Personal Loan', 'Place', 'Longitude', 'Latitude'])
      2 y=df['Personal Loan']
----> 3 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    ...
ValueError: Found input variables with inconsistent numbers of samples: [4994, 4999]
The ValueError is not a verdict on Naive Bayes: the cell builds a fresh lowercase x (4,999 rows) but then splits the stale uppercase X (4,994 rows, from df2) against y taken from df. Passing x to train_test_split removes the mismatch; see the corrected sketch below.
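For completeness, a corrected sketch that splits the freshly built x and y (both 4,999 rows); no result is claimed here. ComplementNB requires non-negative features, which all remaining columns satisfy:
In [ ]:
# Corrected split: use the lowercase x built in the failing cell above.
x_train, x_test, y_train_nb, y_test_nb = train_test_split(x, y, test_size = 0.2, random_state = 0)
nb = ComplementNB()
nb.fit(x_train, y_train_nb)
print('Accuracy is', metrics.accuracy_score(y_test_nb, nb.predict(x_test)))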
In [392]:
df['Personal Loan'].value_counts()
Out[392]:
Personal Loan
0    4519
1     480
Name: count, dtype: int64
In [394]:
4519/480
Out[394]:
9.414583333333333
The target distribution is imbalanced: roughly 9.4 non-borrowers per borrower.