ASSIGNMENTS#

from google.colab import drive
drive.mount('/content/drive')

Assignment 1#

import pandas as pd

p=pd.read_csv('https://raw.githubusercontent.com/Rosita19/datamining/main/drug200.csv')
p.head()
Age Sex BP Cholesterol Na_to_K Drug
0 23 F HIGH HIGH 25.355 DrugY
1 47 M LOW HIGH 13.093 drugC
2 47 M LOW HIGH 10.114 drugC
3 28 F NORMAL HIGH 7.798 drugX
4 61 F LOW HIGH 18.043 DrugY
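The cell below computes a dissimilarity for each data type by hand. For reference, the standard definitions it implements (with p = 2 attributes per object) are:

\[d_{nominal}(i,j) = \frac{p - m}{p}, \qquad d_{binary}(i,j) = \frac{r + s}{q + r + s + t}, \qquad d_{numeric}(i,j) = \sqrt{\sum_{k=1}^{p}(x_{ik} - x_{jk})^{2}}\]

where m is the number of matching attributes and q, r, s, t count the (1,1), (1,0), (0,1) and (0,0) attribute pairs, respectively.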
import math

print("Data Nominal \nTempat Lahir - Agama \nA=[Jombang, Islam] \nB=[Mojokerto, Islam] \nC=[Jombang, Kristen] \nD=[Jombang, Katolik]\n")
print("Data Binary \nGender - StatusKawin \nA=[1, 1] \nB=[1, 0] \nC=[1, 0] \nD=[0, 1]\n")
print("Data Numeric \nUmur - Berat badan \nA=[20, 45] \nB=[25, 60] \nC=[50, 55] \nD=[35, 70]\n")

#Nominal
#Birthplace - religion
A=['Jombang', 'Islam']
B=['Mojokerto', 'Islam']
C=['Jombang', 'Kristen']
D=['Jombang', 'Katolik']

data = input('options \na = d(A,B) \nb = d(A,C) \nc = d(A,D) : ')
nominal=0
dataNominal=0
if data == 'a':
  for k in range(2):  # compare both attributes, not just the first
    if A[k]==B[k]:
      nominal+=1
  dataNominal = (2-nominal)/2  # mismatched attributes / total attributes
  print("Nominal result",dataNominal)
elif data == 'b':
  for k in range(2):
    if A[k]==C[k]:
      nominal+=1
  dataNominal = (2-nominal)/2
  print("Nominal result",dataNominal)
elif data == 'c':
  for k in range(2):
    if A[k]==D[k]:
      nominal+=1
  dataNominal = (2-nominal)/2
  print("Nominal result",dataNominal)
else:
  print('Input does not match the options!')


#Numeric
#Age - weight
A=[20, 45]
B=[25, 60]
C=[50, 55]
D=[35, 70]

dataNumeric=0
if data == 'a':
  total=(A[0]-B[0])*(A[0]-B[0])+(A[1]-B[1])*(A[1]-B[1])
  dataNumeric=math.sqrt(total)  # Euclidean distance
  print("Numeric result = ",dataNumeric)
elif data == 'b':
  total=(A[0]-C[0])*(A[0]-C[0])+(A[1]-C[1])*(A[1]-C[1])
  dataNumeric=math.sqrt(total)
  print("Numeric result = ",dataNumeric)
elif data == 'c':
  total=(A[0]-D[0])*(A[0]-D[0])+(A[1]-D[1])*(A[1]-D[1])
  dataNumeric=math.sqrt(total)
  print("Numeric result = ",dataNumeric)
else:
  print("Input does not match the options!")

#Binary
#Gender - marital status
A=[1, 1]
B=[1, 0]
C=[1, 0]
D=[0, 1]

dataBinary=0
q=0  # attribute pairs (1,1)
r=0  # attribute pairs (1,0)
s=0  # attribute pairs (0,1)
t=0  # attribute pairs (0,0)

if data == 'a':
  for i in range(2):
    if A[i]==1 and B[i]==1:
      q+=1
    if A[i]==1 and B[i]==0:
      r+=1
    if A[i]==0 and B[i]==1:
      s+=1
    if A[i]==0 and B[i]==0:
      t+=1
  dataBinary=(r+s)/(q+r+s+t)  # simple matching dissimilarity
  print("Binary result = ",dataBinary)
elif data == 'b':
  for i in range(2):
    if A[i]==1 and C[i]==1:
      q+=1
    if A[i]==1 and C[i]==0:
      r+=1
    if A[i]==0 and C[i]==1:
      s+=1
    if A[i]==0 and C[i]==0:
      t+=1
  dataBinary=(r+s)/(q+r+s+t)
  print("Binary result = ",dataBinary)
elif data == 'c':
  for i in range(2):
    if A[i]==1 and D[i]==1:
      q+=1
    if A[i]==1 and D[i]==0:
      r+=1
    if A[i]==0 and D[i]==1:
      s+=1
    if A[i]==0 and D[i]==0:
      t+=1
  dataBinary=(r+s)/(q+r+s+t)
  print("Binary result = ",dataBinary)
else:
  print('Input does not match the options!')

print()
print("Total = ", dataNominal+dataBinary+dataNumeric)
Nominal data 
Birthplace - Religion 
A=[Jombang, Islam] 
B=[Mojokerto, Islam] 
C=[Jombang, Kristen] 
D=[Jombang, Katolik]

Binary data 
Gender - MaritalStatus 
A=[1, 1] 
B=[1, 0] 
C=[1, 0] 
D=[0, 1]

Numeric data 
Age - Weight 
A=[20, 45] 
B=[25, 60] 
C=[50, 55] 
D=[35, 70]

options 
a = d(A,B) 
b = d(A,C) 
c = d(A,D) : b
Nominal result 0.5
Numeric result =  31.622776601683793
Binary result =  0.5

Total =  32.622776601683796
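Note that the last line simply sums the three dissimilarities. In the usual mixed-attribute formulation the per-type dissimilarities are averaged rather than summed, \(d(i,j) = \frac{1}{p}\sum_{f} d^{(f)}(i,j)\); the plain sum is kept here to match the output above.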

Assignment 2: Discretization#

import numpy as np

data = pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
data

Equal Width Intervals

Equal-width intervals is a discretization method that splits numeric data into a number of groups whose widths are roughly equal.
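As a quick standalone illustration (toy values, not the iris data), pd.cut splits the value range into k bins of equal width:

import pandas as pd

values = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])
# Three equal-width bins, each spanning (100 - 1) / 3 = 33 units: the counts
# come out very unbalanced (9, 0, 1) because the bins cover equal ranges.
print(pd.cut(values, 3).value_counts().sort_index())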

sepal_length = data[["sepal.length"]].copy()
petal_length = data[["petal.length"]].copy()
sepal_width = data[["sepal.width"]].copy()
petal_width = data[["petal.width"]].copy()

Define a cut function that finds the intervals using the Equal-Width-Intervals method, and a toCategory function that maps each value to a lettered category:

def cut(col, k):
  intervals = pd.cut(data[col], k).value_counts().index.to_list()
  return [[interval.left, interval.right] for interval in intervals]
def toCategory(list_interval, col):
  # get length interval
  length = len(list_interval)

  # sorting interval
  sort_interval = np.sort(list_interval, axis=0)

  # get category from interval
  categories = np.array([chr(65+i) for i in range(length)])[:, None]

  # Combine into interval data
  intervals = np.hstack((sort_interval, categories))

  # operate all data
  newCol = []
  for i, row in data.iterrows():
    d = row[col]
    for interval in intervals:
      if d >= interval[0].astype(float) and d <= interval[1].astype(float):
        newCol.append(interval[2])
        break

  # return new column category
  return np.array(newCol, dtype=str)

Find the intervals by splitting each feature into 3 parts:

interval_sepal_length = cut("sepal.length", 3)
interval_petal_length = cut("petal.length", 3)
interval_sepal_width = cut("sepal.width", 3)
interval_petal_width = cut("petal.width", 3)

print("interval sepal.length = ", interval_sepal_length)
print("interval petal.length = ", interval_petal_length)
print("interval sepal.width = ", interval_sepal_width)
print("interval petal.width = ", interval_petal_width)
interval sepal.length =  [[5.5, 6.7], [4.296, 5.5], [6.7, 7.9]]
interval petal.length =  [[2.967, 4.933], [0.994, 2.967], [4.933, 6.9]]
interval sepal.width =  [[2.8, 3.6], [1.998, 2.8], [3.6, 4.4]]
interval petal.width =  [[0.9, 1.7], [0.0976, 0.9], [1.7, 2.5]]

Display the result of the category assignment:

sepal_length["category"] = toCategory(interval_sepal_length, "sepal.length")
petal_length["category"] = toCategory(interval_petal_length, "petal.length")
sepal_width["category"] = toCategory(interval_sepal_width, "sepal.width")
petal_width["category"] = toCategory(interval_petal_width, "petal.width")

display(sepal_length)
display(petal_length)
display(sepal_width)
display(petal_width)
sepal.length category
0 5.1 A
1 4.9 A
2 4.7 A
3 4.6 A
4 5.0 A
... ... ...
145 6.7 B
146 6.3 B
147 6.5 B
148 6.2 B
149 5.9 B

150 rows × 2 columns

petal.length category
0 1.4 A
1 1.4 A
2 1.3 A
3 1.5 A
4 1.4 A
... ... ...
145 5.2 C
146 5.0 C
147 5.2 C
148 5.4 C
149 5.1 C

150 rows × 2 columns

sepal.width category
0 3.5 B
1 3.0 B
2 3.2 B
3 3.1 B
4 3.6 B
... ... ...
145 3.0 B
146 2.5 A
147 3.0 B
148 3.4 B
149 3.0 B

150 rows × 2 columns

petal.width category
0 0.2 A
1 0.2 A
2 0.2 A
3 0.2 A
4 0.2 A
... ... ...
145 2.3 C
146 1.9 C
147 2.0 C
148 2.3 C
149 1.8 C

150 rows × 2 columns

Equal Frequency Intervals

Equal-frequency intervals is a discretization method that splits numeric data into groups containing roughly equal numbers of members.
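The same toy values as above make the contrast with the equal-width split clear: pd.qcut balances the counts instead of the widths.

import pandas as pd

values = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])
# Three equal-frequency bins: the counts are balanced (4, 3, 3) while the
# bin widths differ wildly.
print(pd.qcut(values, 3).value_counts().sort_index())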

sepal_length = data[["sepal.length"]].copy()
petal_length = data[["petal.length"]].copy()
sepal_width = data[["sepal.width"]].copy()
petal_width = data[["petal.width"]].copy()

Pandas provides the qcut method to find the interval boundaries for Equal-Frequency Intervals:

def qcut(col, k):
  intervals = pd.qcut(data[col], k).value_counts().index.to_list()
  return [[interval.left, interval.right] for interval in intervals]

Find the intervals by splitting each feature into 3 parts:

interval_sepal_length = qcut("sepal.length", 3)
interval_petal_length = qcut("petal.length", 3)
interval_sepal_width = qcut("sepal.width", 3)
interval_petal_width = qcut("petal.width", 3)

print("interval sepal.length = ", interval_sepal_length)
print("interval petal.length = ", interval_petal_length)
print("interval sepal.width = ", interval_sepal_width)
print("interval petal.width = ", interval_petal_width)
interval sepal.length =  [[5.4, 6.3], [4.2989999999999995, 5.4], [6.3, 7.9]]
interval petal.length =  [[2.633, 4.9], [0.999, 2.633], [4.9, 6.9]]
interval sepal.width =  [[1.999, 2.9], [2.9, 3.2], [3.2, 4.4]]
interval petal.width =  [[0.867, 1.6], [0.099, 0.867], [1.6, 2.5]]

Display the result of the category assignment:

sepal_length["category"] = toCategory(interval_sepal_length, "sepal.length")
petal_length["category"] = toCategory(interval_petal_length, "petal.length")
sepal_width["category"] = toCategory(interval_sepal_width, "sepal.width")
petal_width["category"] = toCategory(interval_petal_width, "petal.width")

display(sepal_length)
display(petal_length)
display(sepal_width)
display(petal_width)
sepal.length category
0 5.1 A
1 4.9 A
2 4.7 A
3 4.6 A
4 5.0 A
... ... ...
145 6.7 C
146 6.3 B
147 6.5 C
148 6.2 B
149 5.9 B

150 rows × 2 columns

petal.length category
0 1.4 A
1 1.4 A
2 1.3 A
3 1.5 A
4 1.4 A
... ... ...
145 5.2 C
146 5.0 C
147 5.2 C
148 5.4 C
149 5.1 C

150 rows × 2 columns

sepal.width category
0 3.5 C
1 3.0 B
2 3.2 B
3 3.1 B
4 3.6 C
... ... ...
145 3.0 B
146 2.5 A
147 3.0 B
148 3.4 C
149 3.0 B

150 rows × 2 columns

petal.width category
0 0.2 A
1 0.2 A
2 0.2 A
3 0.2 A
4 0.2 A
... ... ...
145 2.3 C
146 1.9 C
147 2.0 C
148 2.3 C
149 1.8 C

150 rows × 2 columns

Entropy

Entropy is an information value that measures the uncertainty (impurity) of an attribute over a collection of data objects, expressed in bits.
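As a quick sanity check of the definition (a standalone toy helper, separate from the entropy function defined later): a 50/50 class split is maximally impure at 1 bit, a pure set has entropy 0, and the 68/82 split computed below gives roughly 0.9937 bits.

import numpy as np

def H(p):
    # entropy of a class distribution p, in bits
    return -sum(pi * np.log2(pi) for pi in p if pi > 0)

print(H([0.5, 0.5]))          # 1.0 (most impure)
print(H([1.0]))               # 0.0 (pure)
print(H([68/150, 82/150]))    # ~0.9937, matching entropy_d below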

Create a sample to analyze:

sample = data[["sepal.length"]].copy()
sample.describe()
sepal.length
count 150.000000
mean 5.843333
std 0.828066
min 4.300000
25% 5.100000
50% 5.800000
75% 6.400000
max 7.900000

Assign a random category to every row of the data:

np.random.seed(0)
sample["category"] = np.where(np.random.choice(2, sample.shape[0]) < 1, "A", "B")
sample
sepal.length category
0 5.1 A
1 4.9 B
2 4.7 B
3 4.6 A
4 5.0 B
... ... ...
145 6.7 A
146 6.3 B
147 6.5 B
148 6.2 B
149 5.9 B

150 rows × 2 columns

Define a getOverCategory function that counts the whole dataset per category; the counts are later used to compute the entropy:

def getOverCategory(col):
  group = sample.groupby("category").count()
  a = group.loc["A", col]
  b = group.loc["B", col]
  return (a, b, a+b)

The splitter function splits the data at a given value and returns the class counts of the two resulting parts:

def splitter(value:float, col:str)->tuple:
  # get data less and greater from value
  less = sample[sample[col] <= value]
  greater = sample[sample[col] > value]

  # calculate into category for each data
  less_group = less.groupby("category").count()
  greater_group = greater.groupby("category").count()

  # get value based on category
  less_category_A = less_group.loc["A", col] 
  less_category_B = less_group.loc["B", col] 
  greater_category_A = greater_group.loc["A", col] 
  greater_category_B = greater_group.loc["B", col] 

  return (
      [less_category_A, less_category_B, less_category_A + less_category_B],
      [greater_category_A, greater_category_B, greater_category_A + greater_category_B]
  )

Define an entropy function to compute the entropy value.

The entropy formula:

\[Entropy(D) = - \sum_{i=1}^{m} p_i \log_{2} p_i\]
def entropy(d):
  # d = (count_A, count_B, total); assumes both counts are non-zero
  r1 = (d[0] / d[2]) * np.log2(d[0] / d[2])
  r2 = (d[1] / d[2]) * np.log2(d[1] / d[2])
  return np.sum([r1, r2]) * -1

Define an info function that computes the weighted average entropy after a split:

def info(d):
  r1 = (d[0][2] / sample.shape[0]) * entropy(d[0])
  r2 = (d[1][2] / sample.shape[0]) * entropy(d[1])
  return r1 + r2

Define a gain function that computes the difference between the initial entropy and the new one.

The gain formula:

\[Gain = E_{initial} - E_{new}\]

def gain(Einitial, Enew):
  return Einitial - Enew

Compute the initial class distribution D and its entropy:

D = getOverCategory("sepal.length")
entropy_d = entropy(D)
print(D)
print(entropy_d)
(68, 82, 150)
0.993707106604508

Run several test splits to find the split with the best information gain.

First test: split 1 at 4.4

split1  = splitter(4.4, "sepal.length")
info_split1 = info(split1)
gain(entropy_d, info_split1)
0.003488151753460178

Second test: split 2 at 5.5

split2  = splitter(5.5, "sepal.length")
info_split2 = info(split2)
gain(entropy_d, info_split2)
0.012302155146638905

Third test: split 3 at 7.0

split3  = splitter(7.0, "sepal.length")
info_split3 = info(split3)
gain(entropy_d, info_split3)
0.0005490214732508658

From all of the split tests performed, the best split is split 2 (at 5.5), because it gives the largest information gain: 0.012302155146638905.

Assignment 3: KNN (K-Nearest Neighbor)#

%matplotlib inline
!pip install -U scikit-learn
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (1.0.2)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.1.0)
Requirement already satisfied: numpy>=1.14.6 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.21.6)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (3.1.0)
Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.7.3)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
iris = load_iris()
type(iris)
sklearn.utils.Bunch
iris.data
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])
print(iris.feature_names)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print(iris.target)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
print(iris.target_names)
['setosa' 'versicolor' 'virginica']
print(type(iris.data))
print(type(iris.target))
x = iris.data
y = iris.target
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
print(iris.data.shape)
(150, 4)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
print(x_train.shape)
print(x_test.shape)
(120, 4)
(30, 4)
print(y_train.shape)
print(y_test.shape)
(120,)
(30,)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
k_range = range(1, 26)
scores = {}
scores_list = []
for k in k_range:
  knn = KNeighborsClassifier(n_neighbors=k)
  knn.fit(x_train, y_train)
  y_pred = knn.predict(x_test)
  acc = metrics.accuracy_score(y_test, y_pred)
  scores[k] = acc
  scores_list.append(acc)
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(k_range, scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
Text(0, 0.5, 'Testing Accuracy')
_images/Penambangan Data_69_1.png
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x,y)
KNeighborsClassifier()
classes = {0:'setosa', 1:'versicolor', 2:'virginica'}
x_new = [[3,4,5,2],[5,4,2,2]]
y_predict = knn.predict(x_new)
print(classes[y_predict[0]])
print(classes[y_predict[1]])
versicolor
setosa

Assignment 4: Naive Bayes Classifier#

# Naive Bayes Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris
sepal.length sepal.width petal.length petal.width variety
0 5.1 3.5 1.4 0.2 Setosa
1 4.9 3.0 1.4 0.2 Setosa
2 4.7 3.2 1.3 0.2 Setosa
3 4.6 3.1 1.5 0.2 Setosa
4 5.0 3.6 1.4 0.2 Setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Virginica
146 6.3 2.5 5.0 1.9 Virginica
147 6.5 3.0 5.2 2.0 Virginica
148 6.2 3.4 5.4 2.3 Virginica
149 5.9 3.0 5.1 1.8 Virginica

150 rows × 5 columns

X = iris.iloc[:,0:4].values
y = iris.iloc[:,4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 82)
# Feature Scaling to bring the variable in a single scale
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting Naive Bayes Classification to the Training set
from sklearn.naive_bayes import GaussianNB
nvclassifier = GaussianNB()
nvclassifier.fit(X_train, y_train)
GaussianNB()
# Predicting the Test set results
y_pred = nvclassifier.predict(X_test)
print(y_pred)
['Virginica' 'Virginica' 'Setosa' 'Setosa' 'Setosa' 'Virginica'
 'Versicolor' 'Versicolor' 'Versicolor' 'Versicolor' 'Versicolor'
 'Virginica' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Virginica' 'Versicolor'
 'Setosa' 'Versicolor' 'Setosa' 'Virginica' 'Setosa' 'Virginica'
 'Virginica' 'Versicolor' 'Virginica' 'Setosa' 'Virginica' 'Versicolor']
# let's see the actual and predicted values side by side
y_compare = np.vstack((y_test,y_pred)).T
#actual value on the left side and predicted value on the right hand side
#printing the top 5 values
y_compare[:5,:]
array([['Virginica', 'Virginica'],
       ['Virginica', 'Virginica'],
       ['Setosa', 'Setosa'],
       ['Setosa', 'Setosa'],
       ['Setosa', 'Setosa']], dtype=object)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[11  0  0]
 [ 0  8  1]
 [ 0  1  9]]
#finding accuracy from the confusion matrix.
a = cm.shape
corrPred = 0
falsePred = 0

for row in range(a[0]):
    for c in range(a[1]):
        if row == c:
            corrPred +=cm[row,c]
        else:
            falsePred += cm[row,c]
print('Correct predictions: ', corrPred)
print('False predictions', falsePred)
print ('\n\nAccuracy of the Naive Bayes Classification is: ', corrPred/(cm.sum()))
Correct predictions:  28
False predictions 2


Accuracy of the Naive Bayes Classification is:  0.9333333333333333

Version 2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
X = iris.iloc[:,0:4].values
y = iris.iloc[:,4].values
y.shape
(150,)
X.shape
(150, 4)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
#Train and Test split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test) 
accuracy_nb=round(accuracy_score(y_test,Y_pred)* 100, 2)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)

cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test,Y_pred)
precision =precision_score(y_test, Y_pred,average='micro')
recall =  recall_score(y_test, Y_pred,average='micro')
f1 = f1_score(y_test,Y_pred,average='micro')
print('Confusion matrix for Naive Bayes\n',cm)
print('accuracy_Naive Bayes: %.3f' %accuracy)
print('precision_Naive Bayes: %.3f' %precision)
print('recall_Naive Bayes: %.3f' %recall)
print('f1-score_Naive Bayes : %.3f' %f1)
Confusion matrix for Naive Bayes
 [[16  0  0]
 [ 0 18  0]
 [ 0  0 11]]
accuracy_Naive Bayes: 1.000
precision_Naive Bayes: 1.000
recall_Naive Bayes: 1.000
f1-score_Naive Bayes : 1.000
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=100, n_features=4,
                              n_informative=2, n_redundant=0,
                              random_state=0, shuffle=False)
clf = BaggingClassifier(base_estimator=SVC(),
                         n_estimators=10, random_state=0).fit(X, y)
clf.predict([[0, 0, 0, 0]])
array([1])

Bagging Version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
N = 1000
data = np.arange(N)
BS = np.random.choice(data, size = N)
BS_unique = set(BS)
len(BS_unique)
630
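This matches the expected behaviour of bootstrap resampling: drawing \(N\) samples with replacement leaves each item out with probability \((1 - 1/N)^N \approx e^{-1} \approx 0.368\), so roughly 63.2% of the original items appear in the bootstrap sample — here 630 of 1000.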
wine_pd = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/wine.csv")
wine_pd.head()
Alcohol Malic_acid Ash Alcalinity_of_ash Magnesium Total_phenols Flavanoids Nonflavanoid_phenols Proanthocyanins Color_intensity Hue OD280/OD315_of_diluted_wines Proline class
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065 Type1
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050 Type1
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185 Type1
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480 Type1
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735 Type1
y = wine_pd.pop('class').values
X = wine_pd.values
X.shape
(178, 13)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
dtree = DecisionTreeClassifier(criterion='entropy')
# A helper function that will run RepeatedKFold cross validation for a range 
# of ensemble sizes (est_range).
# Takes, the estimator, n_reps and the range as arguments. 
def eval_bag_est_range(the_est, n_reps, est_range, folds = 10):
    n_est_dict = {}
    for n_est in est_range: 
        the_bag = BaggingClassifier(the_est, 
                            n_estimators = n_est,
                            max_samples = 1.0, # bootstrap resampling 
                            bootstrap = True)
        bag_cv = cross_validate(the_bag, X, y, n_jobs=-1,
                                cv=RepeatedKFold(n_splits=folds, n_repeats=n_reps)) 
        n_est_dict[n_est]=bag_cv['test_score'].mean()
    return n_est_dict
kNNpipe  = Pipeline(steps=[ ('scaler', StandardScaler()),
                           ('classifier', KNeighborsClassifier(n_neighbors=1))])

NNPipe = Pipeline(steps=[ ('scaler', StandardScaler()),
                           ('classifier', MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                        hidden_layer_sizes=(5, 2)))])
res_kNN_bag = eval_bag_est_range(kNNpipe, 10, range(2,16))
res_NN_bag  = eval_bag_est_range(NNPipe, 10, range(2,16))  # evaluate the neural-net pipeline as well
kNN_list = sorted(res_kNN_bag.items()) # sorted by key, return a list of tuples
nc, kNN_accs = zip(*kNN_list) # unpack a list of pairs into two tuples
NN_list = sorted(res_NN_bag.items()) # sorted by key, return a list of tuples
nc, NN_accs = zip(*NN_list) # unpack a list of pairs into two tuples

f = plt.figure(figsize=(5,4))

plt.plot(nc, NN_accs, lw = 2, color = 'r', label = 'Neural Net')
plt.plot(nc, kNN_accs, lw = 2, color = 'orange', label = 'k-NN')

plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.ylim([0.94,1])
plt.legend(loc = 'upper left')
plt.grid(axis = 'y')
f.savefig('bag-est-plot.pdf')

Assignment 5: K-Means Clustering#

Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

Import the data from GitHub

iris = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/iris.csv")

Display the iris data without the label

X = iris.values[:, 0:4]
y = iris.values[:, 4]
X
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3.0, 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5.0, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5.0, 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3.0, 1.4, 0.1],
       [4.3, 3.0, 1.1, 0.1],
       [5.8, 4.0, 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1.0, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5.0, 3.0, 1.6, 0.2],
       [5.0, 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.0, 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [4.4, 3.0, 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5.0, 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5.0, 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3.0, 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5.0, 3.3, 1.4, 0.2],
       [7.0, 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4.0, 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1.0],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5.0, 2.0, 3.5, 1.0],
       [5.9, 3.0, 4.2, 1.5],
       [6.0, 2.2, 4.0, 1.0],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3.0, 4.5, 1.5],
       [5.8, 2.7, 4.1, 1.0],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4.0, 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3.0, 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3.0, 5.0, 1.7],
       [6.0, 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1.0],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1.0],
       [5.8, 2.7, 3.9, 1.2],
       [6.0, 2.7, 5.1, 1.6],
       [5.4, 3.0, 4.5, 1.5],
       [6.0, 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3.0, 4.1, 1.3],
       [5.5, 2.5, 4.0, 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3.0, 4.6, 1.4],
       [5.8, 2.6, 4.0, 1.2],
       [5.0, 2.3, 3.3, 1.0],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3.0, 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3.0, 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6.0, 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3.0, 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3.0, 5.8, 2.2],
       [7.6, 3.0, 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2.0],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3.0, 5.5, 2.1],
       [5.7, 2.5, 5.0, 2.0],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3.0, 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6.0, 2.2, 5.0, 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2.0],
       [7.7, 2.8, 6.7, 2.0],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6.0, 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3.0, 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3.0, 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2.0],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3.0, 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6.0, 3.0, 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3.0, 5.2, 2.3],
       [6.3, 2.5, 5.0, 1.9],
       [6.5, 3.0, 5.2, 2.0],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3.0, 5.1, 1.8]], dtype=object)
iris.info()
iris[0:10]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
6 4.6 3.4 1.4 0.3 setosa
7 5.0 3.4 1.5 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
9 4.9 3.1 1.5 0.1 setosa
#Frequency distribution of species
iris_outcome = pd.crosstab(index=iris["species"],  # Make a crosstab
                           columns="count")        # Name the count column

iris_outcome
col_0 count
species
setosa 50
versicolor 50
virginica 50
iris_setosa=iris.loc[iris["species"]=="setosa"]
iris_virginica=iris.loc[iris["species"]=="virginica"]
iris_versicolor=iris.loc[iris["species"]=="versicolor"]
sns.FacetGrid(iris,hue="species",height=3).map(sns.distplot,"sepal_length").add_legend()
sns.FacetGrid(iris,hue="species",height=3).map(sns.distplot,"sepal_width").add_legend()
sns.FacetGrid(iris,hue="species",height=3).map(sns.distplot,"petal_length").add_legend()
plt.show()
plt.show()
_images/Penambangan Data_118_1.png _images/Penambangan Data_118_2.png _images/Penambangan Data_118_3.png
sns.boxplot(x="species",y="sepal_length",data=iris)
plt.show()
_images/Penambangan Data_119_0.png
sns.violinplot(x="species",y="sepal_length",data=iris)
plt.show()
_images/Penambangan Data_120_0.png
sns.set_style("whitegrid")
sns.pairplot(iris,hue="species",height=3);
plt.show()
_images/Penambangan Data_121_1.png
#Finding the optimum number of clusters for k-means clustering
from sklearn.cluster import KMeans
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
wcss
[681.3706,
 152.3479517603579,
 78.851441426146,
 57.22847321428572,
 46.47223015873017,
 39.03998724608726,
 34.29971212121213,
 30.06311061745273,
 28.271721728563833,
 26.09432474054042]
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') #within cluster sum of squares
plt.show()
_images/Penambangan Data_123_0.png
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_kmeans = kmeans.fit_predict(X)
y_kmeans
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X_new=pca.fit_transform(X)
X_new
array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,  0.31203998],
       [-2.63198939, -0.19696122],
       [-2.58739848, -0.20431849],
       [-2.4099325 ,  0.41092426],
       [-2.64886233,  0.81336382],
       [-2.59873675,  1.09314576],
       [-2.63692688, -0.12132235],
       [-2.86624165,  0.06936447],
       [-2.62523805,  0.59937002],
       [-2.80068412,  0.26864374],
       [-2.98050204, -0.48795834],
       [-2.59000631,  0.22904384],
       [-2.77010243,  0.26352753],
       [-2.84936871, -0.94096057],
       [-2.99740655, -0.34192606],
       [-2.40561449,  0.18887143],
       [-2.20948924,  0.43666314],
       [-2.71445143, -0.2502082 ],
       [-2.53814826,  0.50377114],
       [-2.83946217, -0.22794557],
       [-2.54308575,  0.57941002],
       [-2.70335978,  0.10770608],
       [ 1.28482569,  0.68516047],
       [ 0.93248853,  0.31833364],
       [ 1.46430232,  0.50426282],
       [ 0.18331772, -0.82795901],
       [ 1.08810326,  0.07459068],
       [ 0.64166908, -0.41824687],
       [ 1.09506066,  0.28346827],
       [-0.74912267, -1.00489096],
       [ 1.04413183,  0.2283619 ],
       [-0.0087454 , -0.72308191],
       [-0.50784088, -1.26597119],
       [ 0.51169856, -0.10398124],
       [ 0.26497651, -0.55003646],
       [ 0.98493451, -0.12481785],
       [-0.17392537, -0.25485421],
       [ 0.92786078,  0.46717949],
       [ 0.66028376, -0.35296967],
       [ 0.23610499, -0.33361077],
       [ 0.94473373, -0.54314555],
       [ 0.04522698, -0.58383438],
       [ 1.11628318, -0.08461685],
       [ 0.35788842, -0.06892503],
       [ 1.29818388, -0.32778731],
       [ 0.92172892, -0.18273779],
       [ 0.71485333,  0.14905594],
       [ 0.90017437,  0.32850447],
       [ 1.33202444,  0.24444088],
       [ 1.55780216,  0.26749545],
       [ 0.81329065, -0.1633503 ],
       [-0.30558378, -0.36826219],
       [-0.06812649, -0.70517213],
       [-0.18962247, -0.68028676],
       [ 0.13642871, -0.31403244],
       [ 1.38002644, -0.42095429],
       [ 0.58800644, -0.48428742],
       [ 0.80685831,  0.19418231],
       [ 1.22069088,  0.40761959],
       [ 0.81509524, -0.37203706],
       [ 0.24595768, -0.2685244 ],
       [ 0.16641322, -0.68192672],
       [ 0.46480029, -0.67071154],
       [ 0.8908152 , -0.03446444],
       [ 0.23054802, -0.40438585],
       [-0.70453176, -1.01224823],
       [ 0.35698149, -0.50491009],
       [ 0.33193448, -0.21265468],
       [ 0.37621565, -0.29321893],
       [ 0.64257601,  0.01773819],
       [-0.90646986, -0.75609337],
       [ 0.29900084, -0.34889781],
       [ 2.53119273, -0.00984911],
       [ 1.41523588, -0.57491635],
       [ 2.61667602,  0.34390315],
       [ 1.97153105, -0.1797279 ],
       [ 2.35000592, -0.04026095],
       [ 3.39703874,  0.55083667],
       [ 0.52123224, -1.19275873],
       [ 2.93258707,  0.3555    ],
       [ 2.32122882, -0.2438315 ],
       [ 2.91675097,  0.78279195],
       [ 1.66177415,  0.24222841],
       [ 1.80340195, -0.21563762],
       [ 2.1655918 ,  0.21627559],
       [ 1.34616358, -0.77681835],
       [ 1.58592822, -0.53964071],
       [ 1.90445637,  0.11925069],
       [ 1.94968906,  0.04194326],
       [ 3.48705536,  1.17573933],
       [ 3.79564542,  0.25732297],
       [ 1.30079171, -0.76114964],
       [ 2.42781791,  0.37819601],
       [ 1.19900111, -0.60609153],
       [ 3.49992004,  0.4606741 ],
       [ 1.38876613, -0.20439933],
       [ 2.2754305 ,  0.33499061],
       [ 2.61409047,  0.56090136],
       [ 1.25850816, -0.17970479],
       [ 1.29113206, -0.11666865],
       [ 2.12360872, -0.20972948],
       [ 2.38800302,  0.4646398 ],
       [ 2.84167278,  0.37526917],
       [ 3.23067366,  1.37416509],
       [ 2.15943764, -0.21727758],
       [ 1.44416124, -0.14341341],
       [ 1.78129481, -0.49990168],
       [ 3.07649993,  0.68808568],
       [ 2.14424331,  0.1400642 ],
       [ 1.90509815,  0.04930053],
       [ 1.16932634, -0.16499026],
       [ 2.10761114,  0.37228787],
       [ 2.31415471,  0.18365128],
       [ 1.9222678 ,  0.40920347],
       [ 1.41523588, -0.57491635],
       [ 2.56301338,  0.2778626 ],
       [ 2.41874618,  0.3047982 ],
       [ 1.94410979,  0.1875323 ],
       [ 1.52716661, -0.37531698],
       [ 1.76434572,  0.07885885],
       [ 1.90094161,  0.11662796],
       [ 1.39018886, -0.28266094]])
#Visualising the clusters (from y_kmeans above: cluster 1 = setosa, 0 = versicolour, 2 = virginica)
Xf = X.astype(float)  # plain float array for plotting
plt.scatter(Xf[y_kmeans == 1, 0], Xf[y_kmeans == 1, 1], s = 100, c = 'purple', label = 'Iris-setosa')
plt.scatter(Xf[y_kmeans == 0, 0], Xf[y_kmeans == 0, 1], s = 100, c = 'orange', label = 'Iris-versicolour')
plt.scatter(Xf[y_kmeans == 2, 0], Xf[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Iris-virginica')

#Plotting the centroids of the clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 100, c = 'red', label = 'Centroids')

plt.legend()
<matplotlib.legend.Legend at 0x7f6cfc1dde10>
_images/Penambangan Data_126_1.png
# 3d scatterplot using matplotlib

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Xf[y_kmeans == 1, 0], Xf[y_kmeans == 1, 1], Xf[y_kmeans == 1, 2], s = 100, c = 'purple', label = 'Iris-setosa')
ax.scatter(Xf[y_kmeans == 0, 0], Xf[y_kmeans == 0, 1], Xf[y_kmeans == 0, 2], s = 100, c = 'orange', label = 'Iris-versicolour')
ax.scatter(Xf[y_kmeans == 2, 0], Xf[y_kmeans == 2, 1], Xf[y_kmeans == 2, 2], s = 100, c = 'green', label = 'Iris-virginica')

#Plotting the centroids of the clusters
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], kmeans.cluster_centers_[:, 2], s = 100, c = 'red', label = 'Centroids')
plt.show()
_images/Penambangan Data_127_0.png

Assignment 6: Decision Tree#

from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
iris = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/iris.csv")
iris
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

from sklearn import tree
X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
clf.predict([[2., 2.]])
array([1])
clf.predict_proba([[2., 2.]])
array([[0., 1.]])
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
X, y = iris.data, iris.target
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
tree.plot_tree(clf)
[Text(0.5, 0.9166666666666666, 'X[3] <= 0.8\ngini = 0.667\nsamples = 150\nvalue = [50, 50, 50]'),
 Text(0.4230769230769231, 0.75, 'gini = 0.0\nsamples = 50\nvalue = [50, 0, 0]'),
 Text(0.5769230769230769, 0.75, 'X[3] <= 1.75\ngini = 0.5\nsamples = 100\nvalue = [0, 50, 50]'),
 Text(0.3076923076923077, 0.5833333333333334, 'X[2] <= 4.95\ngini = 0.168\nsamples = 54\nvalue = [0, 49, 5]'),
 Text(0.15384615384615385, 0.4166666666666667, 'X[3] <= 1.65\ngini = 0.041\nsamples = 48\nvalue = [0, 47, 1]'),
 Text(0.07692307692307693, 0.25, 'gini = 0.0\nsamples = 47\nvalue = [0, 47, 0]'),
 Text(0.23076923076923078, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
 Text(0.46153846153846156, 0.4166666666666667, 'X[3] <= 1.55\ngini = 0.444\nsamples = 6\nvalue = [0, 2, 4]'),
 Text(0.38461538461538464, 0.25, 'gini = 0.0\nsamples = 3\nvalue = [0, 0, 3]'),
 Text(0.5384615384615384, 0.25, 'X[2] <= 5.45\ngini = 0.444\nsamples = 3\nvalue = [0, 2, 1]'),
 Text(0.46153846153846156, 0.08333333333333333, 'gini = 0.0\nsamples = 2\nvalue = [0, 2, 0]'),
 Text(0.6153846153846154, 0.08333333333333333, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
 Text(0.8461538461538461, 0.5833333333333334, 'X[2] <= 4.85\ngini = 0.043\nsamples = 46\nvalue = [0, 1, 45]'),
 Text(0.7692307692307693, 0.4166666666666667, 'X[1] <= 3.1\ngini = 0.444\nsamples = 3\nvalue = [0, 1, 2]'),
 Text(0.6923076923076923, 0.25, 'gini = 0.0\nsamples = 2\nvalue = [0, 0, 2]'),
 Text(0.8461538461538461, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]'),
 Text(0.9230769230769231, 0.4166666666666667, 'gini = 0.0\nsamples = 43\nvalue = [0, 0, 43]')]
_images/Penambangan Data_136_1.png
import graphviz 
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("iris")
'iris.pdf'
dot_data = tree.export_graphviz(clf, out_file=None, 
                      feature_names=iris.feature_names,  
                      class_names=iris.target_names,  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  
graph 
_images/Penambangan Data_138_0.svg
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
iris = load_iris()
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(iris.data, iris.target)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)
|--- petal width (cm) <= 0.80
|   |--- class: 0
|--- petal width (cm) >  0.80
|   |--- petal width (cm) <= 1.75
|   |   |--- class: 1
|   |--- petal width (cm) >  1.75
|   |   |--- class: 2

UTS (Midterm Exam)#

Analyze the data at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Coimbra using classification with

  • the Naive Bayes Classifier method

  • the Decision Tree method

1. Naive Bayes Classifier Method#

# Naive Bayes Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
dataR2="https://raw.githubusercontent.com/Rosita19/datamining/main/dataR2.csv"
data = pd.read_csv(dataR2)
data
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 1
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 1
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 1
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 1
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 1
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 2
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 2
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 2
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 2
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 2

116 rows × 10 columns

data.shape
(116, 10)
# Select the independent variables 'X' and the target variable 'y'
X = data.iloc[:,:9].values
y = data['Classification'].values
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
# Feature scaling to bring the variables onto a single scale
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fit the Naive Bayes classifier to the training set
from sklearn.naive_bayes import GaussianNB
nvclassifier = GaussianNB()
nvclassifier.fit(X_train, y_train)
GaussianNB()
# Predict the test set results
y_pred = nvclassifier.predict(X_test)
print(y_pred)
[1 2 1 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1 1 1 2 2 1 2]
# actual vs predicted values
y_compare = np.vstack((y_test,y_pred)).T
# actual values on the left, predicted values on the right
# print the top 10 values
y_compare[:10,:]
array([[1, 1],
       [2, 2],
       [2, 1],
       [1, 1],
       [1, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 1],
       [2, 1]])
# Build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[7 4]
 [7 6]]
# Compute the accuracy from the confusion matrix
a = cm.shape
corrPred = 0
falsePred = 0

for row in range(a[0]):
    for c in range(a[1]):
        if row == c:
            corrPred +=cm[row,c]
        else:
            falsePred += cm[row,c]
print('Correct predictions: ', corrPred)
print('False predictions', falsePred)
print ('\n\nAccuracy of the Naive Bayes Classification is: ', corrPred/(cm.sum()))
Correct predictions:  13
False predictions 11


Accuracy of the Naive Bayes Classification is:  0.5416666666666666

2. Decision Tree Method#

The Naive Bayes Classifier is a classification method rooted in Bayes' theorem: a probabilistic and statistical approach, proposed by the English scholar Thomas Bayes, that predicts future probabilities from past experience. The defining trait of the Naive Bayes Classifier is its very strong ("naive") assumption that each condition or event is independent of the others.
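In symbols, Bayes' theorem gives the posterior probability of class \(C\) given the attribute vector \(X\):

\[P(C \mid X) = \frac{P(X \mid C)\,P(C)}{P(X)}\]

and the naive independence assumption factorizes the likelihood as \(P(X \mid C) = \prod_{k} P(x_{k} \mid C)\).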

A decision tree is a machine learning algorithm that uses a set of rules to make decisions, with a tree-like structure that models possible outcomes, resource costs, utility, and possible consequences or risks. The idea is to present the algorithm with conditional statements, whose branches represent the decision-making steps that can lead to a favourable outcome; a sketch of fitting such a tree to this dataset is given at the end of this section.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numba
import cv2 as cv
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
dataR2="https://raw.githubusercontent.com/Rosita19/datamining/main/dataR2.csv"
data = pd.read_csv(dataR2)
data
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 1
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 1
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 1
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 1
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 1
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 2
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 2
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 2
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 2
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 2

116 rows × 10 columns

data.isnull().sum()
Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64
data.shape
(116, 10)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB
data.tail()
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
111 45 26.85 92 3.33 0.755688 54.68 12.10 10.96 268.23 2
112 62 26.84 100 4.53 1.117400 12.45 21.42 7.32 330.16 2
113 65 32.05 97 5.73 1.370998 61.48 22.54 10.33 314.05 2
114 72 25.59 82 2.82 0.570392 24.96 33.75 3.27 392.46 2
115 86 27.18 138 19.91 6.777364 90.28 14.11 4.35 90.09 2
data["Classification"].value_counts()
2    64
1    52
Name: Classification, dtype: int64
# Note: this replace targets the strings '1' and '2', but Classification is an
# integer column, so nothing matches and the labels stay 1/2 (the value_counts
# below confirms this); see the corrected sketch after the value_counts
data=data.replace(to_replace='1',value=0)
data=data.replace(to_replace='2',value=1)
data
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 1
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 1
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 1
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 1
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 1
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 2
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 2
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 2
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 2
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 2

116 rows × 10 columns

data['Classification'].value_counts()
2    64
1    52
Name: Classification, dtype: int64
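A sketch of what the replace above was presumably meant to do, matching the integer labels so the remapping actually fires:

# Remap the integer class labels 1 -> 0 and 2 -> 1 in one pass
data['Classification'] = data['Classification'].replace({1: 0, 2: 1})
data['Classification'].value_counts()   # would then show 1: 64 and 0: 52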
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB
X=data.iloc[:,1:-1]
X
BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1
0 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114
1 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786
2 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697
3 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220
4 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920
... ... ... ... ... ... ... ... ...
111 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230
112 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160
113 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050
114 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460
115 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090

116 rows × 8 columns

Y=data.iloc[:,-1:]
Y
Classification
0 1
1 1
2 1
3 1
4 1
... ...
111 2
112 2
113 2
114 2
115 2

116 rows × 1 columns

X_train, X_test, Y_train, Y_test=train_test_split(X, Y, test_size=0.2, random_state=42)
giniindex=DecisionTreeClassifier(criterion='gini',max_depth=5,min_samples_leaf=3,random_state=100)
giniindex.fit(X_train,Y_train)
DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, random_state=100)
y_pred=giniindex.predict(X_test)
confusion_matrix(Y_test,y_pred)
array([[10,  2],
       [ 0, 12]])
print(classification_report(Y_test,y_pred))
              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.86      1.00      0.92        12

    accuracy                           0.92        24
   macro avg       0.93      0.92      0.92        24
weighted avg       0.93      0.92      0.92        24
entropy_deci=DecisionTreeClassifier(criterion='entropy',max_depth=5,min_samples_leaf=3,random_state=100)
entropy_deci.fit(X_train,Y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=3,
                       random_state=100)
y_pred_entropy=entropy_deci.predict(X_test)
confusion_matrix(Y_test,y_pred_entropy)
array([[10,  2],
       [ 0, 12]])
print(classification_report(Y_test,y_pred_entropy))
              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.86      1.00      0.92        12

    accuracy                           0.92        24
   macro avg       0.93      0.92      0.92        24
weighted avg       0.93      0.92      0.92        24
from sklearn import tree
tree.plot_tree(giniindex)
[Text(0.47619047619047616, 0.9166666666666666, 'X[1] <= 91.5\ngini = 0.491\nsamples = 92\nvalue = [40, 52]'),
 Text(0.21428571428571427, 0.75, 'X[2] <= 3.793\ngini = 0.432\nsamples = 38\nvalue = [26, 12]'),
 Text(0.09523809523809523, 0.5833333333333334, 'X[6] <= 13.163\ngini = 0.375\nsamples = 8\nvalue = [2, 6]'),
 Text(0.047619047619047616, 0.4166666666666667, 'gini = 0.5\nsamples = 4\nvalue = [2, 2]'),
 Text(0.14285714285714285, 0.4166666666666667, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'),
 Text(0.3333333333333333, 0.5833333333333334, 'X[6] <= 14.227\ngini = 0.32\nsamples = 30\nvalue = [24, 6]'),
 Text(0.23809523809523808, 0.4166666666666667, 'X[2] <= 14.391\ngini = 0.111\nsamples = 17\nvalue = [16, 1]'),
 Text(0.19047619047619047, 0.25, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]'),
 Text(0.2857142857142857, 0.25, 'gini = 0.444\nsamples = 3\nvalue = [2, 1]'),
 Text(0.42857142857142855, 0.4166666666666667, 'X[0] <= 31.124\ngini = 0.473\nsamples = 13\nvalue = [8, 5]'),
 Text(0.38095238095238093, 0.25, 'X[5] <= 7.537\ngini = 0.278\nsamples = 6\nvalue = [1, 5]'),
 Text(0.3333333333333333, 0.08333333333333333, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),
 Text(0.42857142857142855, 0.08333333333333333, 'gini = 0.444\nsamples = 3\nvalue = [1, 2]'),
 Text(0.47619047619047616, 0.25, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]'),
 Text(0.7380952380952381, 0.75, 'X[4] <= 7.24\ngini = 0.384\nsamples = 54\nvalue = [14, 40]'),
 Text(0.6904761904761905, 0.5833333333333334, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.7857142857142857, 0.5833333333333334, 'X[4] <= 55.591\ngini = 0.338\nsamples = 51\nvalue = [11, 40]'),
 Text(0.6666666666666666, 0.4166666666666667, 'X[6] <= 11.927\ngini = 0.268\nsamples = 44\nvalue = [7, 37]'),
 Text(0.5714285714285714, 0.25, 'X[6] <= 8.31\ngini = 0.386\nsamples = 23\nvalue = [6, 17]'),
 Text(0.5238095238095238, 0.08333333333333333, 'gini = 0.153\nsamples = 12\nvalue = [1, 11]'),
 Text(0.6190476190476191, 0.08333333333333333, 'gini = 0.496\nsamples = 11\nvalue = [5, 6]'),
 Text(0.7619047619047619, 0.25, 'X[5] <= 3.924\ngini = 0.091\nsamples = 21\nvalue = [1, 20]'),
 Text(0.7142857142857143, 0.08333333333333333, 'gini = 0.444\nsamples = 3\nvalue = [1, 2]'),
 Text(0.8095238095238095, 0.08333333333333333, 'gini = 0.0\nsamples = 18\nvalue = [0, 18]'),
 Text(0.9047619047619048, 0.4166666666666667, 'X[5] <= 7.721\ngini = 0.49\nsamples = 7\nvalue = [4, 3]'),
 Text(0.8571428571428571, 0.25, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.9523809523809523, 0.25, 'gini = 0.375\nsamples = 4\nvalue = [1, 3]')]
_images/Penambangan Data_187_1.png
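The coordinate/text list above is just the return value of plot_tree; for a readable rule listing, sklearn also offers export_text. A sketch, assuming the X DataFrame defined earlier in this section is still in scope:

from sklearn.tree import export_text

# Print the fitted gini tree as indented if/else rules with real feature names
print(export_text(giniindex, feature_names=list(X.columns)))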
tree.plot_tree(entropy_deci)
[Text(0.5769230769230769, 0.9166666666666666, 'X[1] <= 91.5\nentropy = 0.988\nsamples = 92\nvalue = [40, 52]'),
 Text(0.38461538461538464, 0.75, 'X[0] <= 31.124\nentropy = 0.9\nsamples = 38\nvalue = [26, 12]'),
 Text(0.3076923076923077, 0.5833333333333334, 'X[6] <= 13.248\nentropy = 0.991\nsamples = 27\nvalue = [15, 12]'),
 Text(0.15384615384615385, 0.4166666666666667, 'X[2] <= 3.793\nentropy = 0.672\nsamples = 17\nvalue = [14, 3]'),
 Text(0.07692307692307693, 0.25, 'entropy = 1.0\nsamples = 4\nvalue = [2, 2]'),
 Text(0.23076923076923078, 0.25, 'X[2] <= 6.83\nentropy = 0.391\nsamples = 13\nvalue = [12, 1]'),
 Text(0.15384615384615385, 0.08333333333333333, 'entropy = 0.0\nsamples = 10\nvalue = [10, 0]'),
 Text(0.3076923076923077, 0.08333333333333333, 'entropy = 0.918\nsamples = 3\nvalue = [2, 1]'),
 Text(0.46153846153846156, 0.4166666666666667, 'X[4] <= 28.041\nentropy = 0.469\nsamples = 10\nvalue = [1, 9]'),
 Text(0.38461538461538464, 0.25, 'entropy = 0.0\nsamples = 7\nvalue = [0, 7]'),
 Text(0.5384615384615384, 0.25, 'entropy = 0.918\nsamples = 3\nvalue = [1, 2]'),
 Text(0.46153846153846156, 0.5833333333333334, 'entropy = 0.0\nsamples = 11\nvalue = [11, 0]'),
 Text(0.7692307692307693, 0.75, 'X[4] <= 7.24\nentropy = 0.826\nsamples = 54\nvalue = [14, 40]'),
 Text(0.6923076923076923, 0.5833333333333334, 'entropy = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.8461538461538461, 0.5833333333333334, 'X[1] <= 111.0\nentropy = 0.752\nsamples = 51\nvalue = [11, 40]'),
 Text(0.7692307692307693, 0.4166666666666667, 'X[4] <= 63.703\nentropy = 0.839\nsamples = 41\nvalue = [11, 30]'),
 Text(0.6923076923076923, 0.25, 'X[6] <= 20.361\nentropy = 0.742\nsamples = 38\nvalue = [8, 30]'),
 Text(0.6153846153846154, 0.08333333333333333, 'entropy = 0.863\nsamples = 28\nvalue = [8, 20]'),
 Text(0.7692307692307693, 0.08333333333333333, 'entropy = 0.0\nsamples = 10\nvalue = [0, 10]'),
 Text(0.8461538461538461, 0.25, 'entropy = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.9230769230769231, 0.4166666666666667, 'entropy = 0.0\nsamples = 10\nvalue = [0, 10]')]
_images/Penambangan Data_188_1.png

Assignment 7: CREDIT RISK MODELING#

# Import the required libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
# Read the credit dataset
dataset = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/credit_score.csv")
# Display the first rows of the credit-score data
dataset.head()
Unnamed: 0 kode_kontrak pendapatan_setahun_juta kpr_aktif durasi_pinjaman_bulan jumlah_tanggungan rata_rata_overdue risk_rating
0 1 AGR-000001 295 YA 48 5 61 - 90 days 4
1 2 AGR-000011 271 YA 36 5 61 - 90 days 4
2 3 AGR-000030 159 TIDAK 12 0 0 - 30 days 1
3 4 AGR-000043 210 YA 12 3 46 - 60 days 3
4 5 AGR-000049 165 TIDAK 36 0 31 - 45 days 2
# Check the number of rows and columns
dataset.shape
(900, 8)

Converting categorical data to numeric using one-hot encoding

# Take the kpr_aktif (active mortgage) column and transform it with one-hot encoding
df_kpr_aktif=pd.get_dummies(dataset['kpr_aktif'])
df_kpr_aktif.head()
TIDAK YA
0 0 1
1 0 1
2 1 0
3 0 1
4 1 0
# Take the rata_rata_overdue (average overdue) column and transform it with one-hot encoding
rata_rata_overdue=pd.get_dummies(dataset['rata_rata_overdue'])
rata_rata_overdue.head()
0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 0 0 0 1 0
1 0 0 0 1 0
2 1 0 0 0 0
3 0 0 1 0 0
4 0 1 0 0 0
# Select the numeric columns together with the contract code and risk rating
numeric = pd.DataFrame(dataset, columns = ['kode_kontrak','pendapatan_setahun_juta','durasi_pinjaman_bulan','jumlah_tanggungan','risk_rating'])
numeric.head()
kode_kontrak pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan risk_rating
0 AGR-000001 295 48 5 4
1 AGR-000011 271 36 5 4
2 AGR-000030 159 12 0 1
3 AGR-000043 210 12 3 3
4 AGR-000049 165 36 0 2
# Combine and display the processed columns
dataset_baru = pd.concat([numeric, df_kpr_aktif, rata_rata_overdue], axis=1)
dataset_baru.head()
kode_kontrak pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan risk_rating TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 AGR-000001 295 48 5 4 0 1 0 0 0 1 0
1 AGR-000011 271 36 5 4 0 1 0 0 0 1 0
2 AGR-000030 159 12 0 1 1 0 1 0 0 0 0
3 AGR-000043 210 12 3 3 0 1 0 0 1 0 0
4 AGR-000049 165 36 0 2 1 0 0 1 0 0 0
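As an aside, pandas can one-hot encode several columns and pass the rest through in a single call, which would replace the manual select-and-concat above; a sketch on the same dataset:

# One-hot encode both categorical columns at once; other columns pass through
dataset_encoded = pd.get_dummies(dataset, columns=['kpr_aktif', 'rata_rata_overdue'])
dataset_encoded.head()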
# Take every column except kode_kontrak and risk_rating as the features to normalize
normalisasi = dataset_baru.drop(["kode_kontrak", "risk_rating"], axis=1)

Normalizing the data with Min-Max
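Min-Max scaling maps each feature $x$ onto a target range $[a, b]$ (default $[0, 1]$):

$$x' = a + \frac{(x - x_{\min})(b - a)}{x_{\max} - x_{\min}}$$

With the defaults this reduces to $x' = (x - x_{\min})/(x_{\max} - x_{\min})$; the Min=1/Max=2 variant further down simply sets $a = 1$, $b = 2$ via feature_range.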

from sklearn.preprocessing import MinMaxScaler
# Fit the feature scaler
scaler = MinMaxScaler()
model = scaler.fit(normalisasi)
scaled_data = model.transform(normalisasi)
# Display the scaled features
print(scaled_data)
[[0.97826087 1.         0.83333333 ... 0.         1.         0.        ]
 [0.87391304 0.66666667 0.83333333 ... 0.         1.         0.        ]
 [0.38695652 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.4173913  0.33333333 0.33333333 ... 0.         0.         0.        ]
 [0.54782609 1.         0.         ... 0.         0.         0.        ]
 [0.5826087  0.33333333 0.33333333 ... 0.         0.         0.        ]]
# Show the min-max-normalized data as a DataFrame
namakolom = normalisasi.columns.values
dataMinMax = pd.DataFrame(scaled_data, columns=namakolom)
dataMinMax.head()
pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 0.978261 1.000000 0.833333 0.0 1.0 0.0 0.0 0.0 1.0 0.0
1 0.873913 0.666667 0.833333 0.0 1.0 0.0 0.0 0.0 1.0 0.0
2 0.386957 0.000000 0.000000 1.0 0.0 1.0 0.0 0.0 0.0 0.0
3 0.608696 0.000000 0.500000 0.0 1.0 0.0 0.0 1.0 0.0 0.0
4 0.413043 0.666667 0.000000 1.0 0.0 0.0 1.0 0.0 0.0 0.0

Normalization with Min=1 and Max=2

# Min-Max scaling with Min = 1 and Max = 2
scaler = MinMaxScaler(feature_range=(1,2))
model = scaler.fit(normalisasi)
scaled_data2 = model.transform(normalisasi)
# Display the scaled features
print(scaled_data2)
[[1.97826087 2.         1.83333333 ... 1.         2.         1.        ]
 [1.87391304 1.66666667 1.83333333 ... 1.         2.         1.        ]
 [1.38695652 1.         1.         ... 1.         1.         1.        ]
 ...
 [1.4173913  1.33333333 1.33333333 ... 1.         1.         1.        ]
 [1.54782609 2.         1.         ... 1.         1.         1.        ]
 [1.5826087  1.33333333 1.33333333 ... 1.         1.         1.        ]]
# Show the data normalized with min=1 and max=2
dataMinMax2 = pd.DataFrame(scaled_data2, columns=normalisasi.columns.values)
dataMinMax2.head()
pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 1.978261 2.000000 1.833333 1.0 2.0 1.0 1.0 1.0 2.0 1.0
1 1.873913 1.666667 1.833333 1.0 2.0 1.0 1.0 1.0 2.0 1.0
2 1.386957 1.000000 1.000000 2.0 1.0 2.0 1.0 1.0 1.0 1.0
3 1.608696 1.000000 1.500000 1.0 2.0 1.0 1.0 2.0 1.0 1.0
4 1.413043 1.666667 1.000000 2.0 1.0 1.0 2.0 1.0 1.0 1.0

Normalization with Z-score
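Z-score standardization expresses each value as its distance from the column mean in units of standard deviation:

$$z = \frac{x - \mu}{\sigma}$$

This is exactly what StandardScaler computes; it stores the fitted means in scaler.mean_, as read out below.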

# Normalize with z-score (StandardScaler)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
model = (scaler.fit(normalisasi))
data_mean = (scaler.mean_)
scale_data = (scaler.transform(normalisasi))
print(scale_data)
[[ 2.54041987  1.32217147  1.03062105 ... -0.6912543   2.54950976
  -0.35949218]
 [ 2.07740679  0.4439764   1.03062105 ... -0.6912543   2.54950976
  -0.35949218]
 [-0.08332092 -1.31241375 -1.46147714 ... -0.6912543  -0.39223227
  -0.35949218]
 ...
 [ 0.05172456 -0.43421867 -0.46463786 ... -0.6912543  -0.39223227
  -0.35949218]
 [ 0.63049091  1.32217147 -1.46147714 ... -0.6912543  -0.39223227
  -0.35949218]
 [ 0.78482861 -0.43421867 -0.46463786 ... -0.6912543  -0.39223227
  -0.35949218]]
# Show the z-score-normalized data
dataZScale = pd.DataFrame(scale_data, columns=normalisasi.columns.values)
dataZScale.head()
pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 2.540420 1.322171 1.030621 -0.868554 0.868554 -0.580772 -0.463222 -0.691254 2.549510 -0.359492
1 2.077407 0.443976 1.030621 -0.868554 0.868554 -0.580772 -0.463222 -0.691254 2.549510 -0.359492
2 -0.083321 -1.312414 -1.461477 1.151339 -1.151339 1.721847 -0.463222 -0.691254 -0.392232 -0.359492
3 0.900582 -1.312414 0.033782 -0.868554 0.868554 -0.580772 -0.463222 1.446646 -0.392232 -0.359492
4 0.032432 0.443976 -1.461477 1.151339 -1.151339 -0.580772 2.158791 -0.691254 -0.392232 -0.359492

Recombining the normalized columns

# Take the contract code and risk rating columns
data_kontrak_risk= pd.DataFrame(dataset, columns=['kode_kontrak','risk_rating'])
# Join the min-max-normalized columns back onto them
kredit_min_max = pd.concat([data_kontrak_risk, dataMinMax], axis=1)
kredit_min_max.head()
kode_kontrak risk_rating pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 AGR-000001 4 0.978261 1.000000 0.833333 0.0 1.0 0.0 0.0 0.0 1.0 0.0
1 AGR-000011 4 0.873913 0.666667 0.833333 0.0 1.0 0.0 0.0 0.0 1.0 0.0
2 AGR-000030 1 0.386957 0.000000 0.000000 1.0 0.0 1.0 0.0 0.0 0.0 0.0
3 AGR-000043 3 0.608696 0.000000 0.500000 0.0 1.0 0.0 0.0 1.0 0.0 0.0
4 AGR-000049 2 0.413043 0.666667 0.000000 1.0 0.0 0.0 1.0 0.0 0.0 0.0
# Join the min=1/max=2 normalized columns back on
kredit_min1_max2 = pd.concat([data_kontrak_risk, dataMinMax2], axis=1)
kredit_min1_max2.head()
kode_kontrak risk_rating pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 AGR-000001 4 1.978261 2.000000 1.833333 1.0 2.0 1.0 1.0 1.0 2.0 1.0
1 AGR-000011 4 1.873913 1.666667 1.833333 1.0 2.0 1.0 1.0 1.0 2.0 1.0
2 AGR-000030 1 1.386957 1.000000 1.000000 2.0 1.0 2.0 1.0 1.0 1.0 1.0
3 AGR-000043 3 1.608696 1.000000 1.500000 1.0 2.0 1.0 1.0 2.0 1.0 1.0
4 AGR-000049 2 1.413043 1.666667 1.000000 2.0 1.0 1.0 2.0 1.0 1.0 1.0
# Join the z-score-normalized columns back on
kredit_Zscore = pd.concat([data_kontrak_risk, dataZScale], axis=1)
kredit_Zscore.head()
kode_kontrak risk_rating pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan TIDAK YA 0 - 30 days 31 - 45 days 46 - 60 days 61 - 90 days > 90 days
0 AGR-000001 4 2.540420 1.322171 1.030621 -0.868554 0.868554 -0.580772 -0.463222 -0.691254 2.549510 -0.359492
1 AGR-000011 4 2.077407 0.443976 1.030621 -0.868554 0.868554 -0.580772 -0.463222 -0.691254 2.549510 -0.359492
2 AGR-000030 1 -0.083321 -1.312414 -1.461477 1.151339 -1.151339 1.721847 -0.463222 -0.691254 -0.392232 -0.359492
3 AGR-000043 3 0.900582 -1.312414 0.033782 -0.868554 0.868554 -0.580772 -0.463222 1.446646 -0.392232 -0.359492
4 AGR-000049 2 0.032432 0.443976 -1.461477 1.151339 -1.151339 -0.580772 2.158791 -0.691254 -0.392232 -0.359492

Splitting the min-max data into training and test sets

# Separate the features and the class from the dataset
# features
X_min_max = kredit_min_max.iloc[:,2:12].values
# class
y_min_max = kredit_min_max.iloc[:,1].values
# Split into training and test sets, holding out 30% for testing
from sklearn.model_selection import train_test_split
X_trainn_min_max, X_testn_min_max, y_trainn_min_max, y_testn_min_max = train_test_split(X_min_max, y_min_max, test_size=0.30, random_state=0, stratify=y_min_max)
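stratify=y_min_max keeps the risk_rating class proportions the same in both splits; a quick check sketch (not in the original run):

import numpy as np

# Class frequencies should have (nearly) the same proportions in both splits
print(np.unique(y_trainn_min_max, return_counts=True))
print(np.unique(y_testn_min_max, return_counts=True))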

Splitting the min=1/max=2 data into training and test sets

# Separate the features and the class from the dataset
# features
X_min1_max2 = kredit_min1_max2.iloc[:,2:12].values
# class
y_min1_max2 = kredit_min1_max2.iloc[:,1].values
# Split into training and test sets, holding out 30% for testing
X_trainn_min1_max2, X_testn_min1_max2, y_trainn_min1_max2, y_testn_min1_max2 = train_test_split(X_min1_max2, y_min1_max2, test_size=0.30, random_state=0, stratify=y_min1_max2)

Splitting the z-score data into training and test sets

# Separate the features and the class from the dataset
# features
X_Zscore = kredit_Zscore.iloc[:,2:12].values
# class
y_Zscore = kredit_Zscore.iloc[:,1].values
# Split into training and test sets, holding out 30% for testing
X_trainn_Zscore, X_testn_Zscore, y_trainn_Zscore, y_testn_Zscore = train_test_split(X_Zscore, y_Zscore, test_size=0.30, random_state=0, stratify=y_Zscore)

Naive Bayes

# Import the required libraries
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
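GaussianNB models each feature within each class as normally distributed, so the per-feature likelihood it plugs into Bayes' rule is

$$P(x_i \mid C) = \frac{1}{\sqrt{2\pi\sigma_C^2}}\exp\!\left(-\frac{(x_i - \mu_C)^2}{2\sigma_C^2}\right)$$

with $\mu_C$ and $\sigma_C^2$ estimated from the training rows of class $C$.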

Naive Bayes with Min-Max normalization

# Compute accuracy and precision for Naive Bayes on the min-max-normalized data
gaussian = GaussianNB()
gaussian.fit(X_trainn_min_max, y_trainn_min_max)
Y_predn_min_max = gaussian.predict(X_testn_min_max) 
accuracy_n_min_max=round(accuracy_score(y_testn_min_max,Y_predn_min_max)* 100, 2)
acc_gaussian = round(gaussian.score(X_trainn_min_max, y_trainn_min_max) * 100, 2)

confusion_m_min_max = confusion_matrix(y_testn_min_max, Y_predn_min_max)
accuracy_n_min_max = accuracy_score(y_testn_min_max,Y_predn_min_max)
precision_n_min_max =precision_score(y_testn_min_max, Y_predn_min_max,average='micro')
recall_n_min_max =  recall_score(y_testn_min_max, Y_predn_min_max,average='micro')
f1_n_min_max = f1_score(y_testn_min_max,Y_predn_min_max,average='micro')
print('Confusion matrix untuk Naive Bayes\n',confusion_m_min_max)
print('Akurasi Naive Bayes: %.3f' %accuracy_n_min_max)
print('Precision Naive Bayes: %.3f' %precision_n_min_max)
print('Recall Naive Bayes: %.3f' %recall_n_min_max)
print('f1-score Naive Bayes : %.3f' %f1_n_min_max)
Confusion matrix untuk Naive Bayes
 [[68  0  0  0  0]
 [ 0 48  0  0  0]
 [ 0  0 87  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 31]]
Akurasi Naive Bayes: 1.000
Precision Naive Bayes: 1.000
Recall Naive Bayes: 1.000
f1-score Naive Bayes : 1.000
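Note that with average='micro' on a single-label multiclass problem, precision, recall and F1 all collapse to plain accuracy, because every false positive for one class is simultaneously a false negative for another:

$$\text{precision}_{\text{micro}} = \frac{\sum_k TP_k}{\sum_k (TP_k + FP_k)} = \frac{\sum_k TP_k}{N} = \text{accuracy}$$

This is why the four numbers above are identical.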

Naive Bayes with Min=1 and Max=2 normalization

# Compute accuracy and precision for Naive Bayes on the min=1/max=2 data
gaussian = GaussianNB()
gaussian.fit(X_trainn_min1_max2, y_trainn_min1_max2)
Y_predn_min1_max2 = gaussian.predict(X_testn_min1_max2) 
accuracy_n_min1_max2=round(accuracy_score(y_testn_min1_max2,Y_predn_min1_max2)* 100, 2)
acc_gaussian = round(gaussian.score(X_trainn_min1_max2, y_trainn_min1_max2) * 100, 2)

confusion_m_min1_max2 = confusion_matrix(y_testn_min1_max2, Y_predn_min1_max2)
accuracy_n_min1_max2 = accuracy_score(y_testn_min1_max2,Y_predn_min1_max2)
precision_n_min1_max2 =precision_score(y_testn_min1_max2, Y_predn_min1_max2,average='micro')
recall_n_min1_max2 =  recall_score(y_testn_min1_max2, Y_predn_min1_max2,average='micro')
f1_n_min1_max2 = f1_score(y_testn_min1_max2,Y_predn_min1_max2,average='micro')
print('Confusion matrix untuk Naive Bayes\n',confusion_m_min1_max2)
print('Akurasi Naive Bayes: %.3f' %accuracy_n_min1_max2)
print('Precision Naive Bayes: %.3f' %precision_n_min1_max2)
print('Recall Naive Bayes: %.3f' %recall_n_min1_max2)
print('f1-score Naive Bayes : %.3f' %f1_n_min1_max2)
Confusion matrix untuk Naive Bayes
 [[68  0  0  0  0]
 [ 0 48  0  0  0]
 [ 0  0 87  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 31]]
Akurasi Naive Bayes: 1.000
Precision Naive Bayes: 1.000
Recall Naive Bayes: 1.000
f1-score Naive Bayes : 1.000

Naive Bayes with Z-score normalization

# Compute accuracy and precision for Naive Bayes on the z-score data
gaussian = GaussianNB()
gaussian.fit(X_trainn_Zscore, y_trainn_Zscore)
Y_predn_Zscore = gaussian.predict(X_testn_Zscore) 
accuracy_n_Zscore=round(accuracy_score(y_testn_Zscore,Y_predn_Zscore)* 100, 2)
acc_gaussian = round(gaussian.score(X_trainn_Zscore, y_trainn_Zscore) * 100, 2)

confusion_m_Zscore = confusion_matrix(y_testn_Zscore, Y_predn_Zscore)
accuracy_n_Zscore = accuracy_score(y_testn_Zscore,Y_predn_Zscore)
precision_n_Zscore =precision_score(y_testn_Zscore, Y_predn_Zscore,average='micro')
recall_n_Zscore =  recall_score(y_testn_Zscore, Y_predn_Zscore,average='micro')
f1_n_Zscore = f1_score(y_testn_Zscore,Y_predn_Zscore,average='micro')
print('Confusion matrix untuk Naive Bayes\n',confusion_m_Zscore)
print('Akurasi Naive Bayes: %.3f' %accuracy_n_Zscore)
print('Precision Naive Bayes: %.3f' %precision_n_Zscore)
print('Recall Naive Bayes: %.3f' %recall_n_Zscore)
print('f1-score Naive Bayes : %.3f' %f1_n_Zscore)
Confusion matrix untuk Naive Bayes
 [[68  0  0  0  0]
 [ 0 48  0  0  0]
 [ 0  0 87  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 31]]
Akurasi Naive Bayes: 1.000
Precision Naive Bayes: 1.000
Recall Naive Bayes: 1.000
f1-score Naive Bayes : 1.000

KNN

KNN with Min-Max normalization

# Compute KNN accuracy on the min-max-normalized data (training accuracy)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_trainn_min_max, y_trainn_min_max)

acc_knn = round(neigh.score(X_trainn_min_max, y_trainn_min_max) * 100, 2)
print("Akurasi KNN :",acc_knn)
Akurasi KNN : 99.68
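Note that score on the training split measures training accuracy, not generalization; a one-line sketch for the held-out accuracy:

# Held-out accuracy on the 30% test split
print("Test accuracy KNN:", round(neigh.score(X_testn_min_max, y_testn_min_max) * 100, 2))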

KNN with Min=1 and Max=2 normalization

# Compute KNN accuracy on the min=1/max=2 data (training accuracy)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_trainn_min1_max2, y_trainn_min1_max2)

acc_knn = round(neigh.score(X_trainn_min1_max2, y_trainn_min1_max2) * 100, 2)
print("Akurasi KNN :",acc_knn)
Akurasi KNN : 99.68

KNN with Z-score normalization

# Compute KNN accuracy on the z-score data (training accuracy)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_trainn_Zscore, y_trainn_Zscore)

acc_knn = round(neigh.score(X_trainn_Zscore, y_trainn_Zscore) * 100, 2)
print("Akurasi KNN :",acc_knn)
Akurasi KNN : 100.0

Decision Tree

# Import the libraries needed for the decision tree
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics
from matplotlib import pyplot as plt

Decision Tree with Min-Max normalization

# Compute accuracy using the Gini index on the min-max-normalized data
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(X_trainn_min_max, y_trainn_min_max)

y_predn_min_max = clf.predict(X_testn_min_max)
print("Accuracy_Decision Tree :",metrics.accuracy_score(y_testn_min_max,y_predn_min_max))
Accuracy_Decision Tree : 1.0

Decision Tree with Min=1 and Max=2 normalization

# Compute accuracy using the Gini index on the min=1/max=2 data
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(X_trainn_min1_max2, y_trainn_min1_max2)

y_predn_min1_max2 = clf.predict(X_testn_min1_max2)
print("Accuracy_Decision Tree :",metrics.accuracy_score(y_testn_min1_max2, y_predn_min1_max2))
Accuracy_Decision Tree : 1.0

Decision Tree with Z-score normalization

# Compute accuracy using the Gini index on the z-score data
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(X_trainn_Zscore, y_trainn_Zscore)

y_predn_Zscore = clf.predict(X_testn_Zscore)
print("Accuracy_Decision Tree :",metrics.accuracy_score(y_testn_Zscore,y_predn_Zscore))
Accuracy_Decision Tree : 1.0
# Draw the fitted decision tree
plt.figure(figsize=(15,15))
# Build the plot
a = tree.plot_tree(clf,
                   rounded = True,
                   filled = True,
                   fontsize=8)
# Show the plot
plt.show()
plt.show()
_images/Penambangan Data_246_0.png

Assignment 8: Bagging Ensemble Learning#

# Import the required libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=10, random_state=0).fit(X_trainn_min_max, y_trainn_min_max)
rsb = clf.predict(X_testn_min_max)
b = ['Decision Tree']
Tree = pd.DataFrame(rsb,columns = b)
X_testn_min_max.shape
(270, 10)
K = 10
clf = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = K),n_estimators=10, random_state=0).fit(X_trainn_min_max, y_trainn_min_max)
rsa = clf.predict(X_testn_min_max)
a = ['KNN']
KNN = pd.DataFrame(rsa,columns = a)
clf = BaggingClassifier(base_estimator=GaussianNB(),n_estimators=10, random_state=0).fit(X_trainn_min_max, y_trainn_min_max)
rsc = clf.predict(X_testn_min_max)
c = ['Naive Bayes']
Bayes = pd.DataFrame(rsc,columns = c)
Result = pd.concat([Tree, KNN,Bayes], axis=1)
Result
Decision Tree KNN Naive Bayes
0 4 4 4
1 3 3 3
2 1 1 1
3 3 3 3
4 3 3 3
... ... ... ...
265 1 1 1
266 4 4 4
267 1 1 1
268 3 3 3
269 3 3 3

270 rows × 3 columns

bagging_accuracy1 = round(100 * accuracy_score(y_testn_min_max, Bayes), 2)
bagging_accuracy2 = round(100 * accuracy_score(y_testn_min_max, Tree), 2)
bagging_accuracy3 = round(100 * accuracy_score(y_testn_min_max, KNN), 2)
print('The accuracy of this model is Bagging Naive Bayes {} %.'.format(bagging_accuracy1))
print('The accuracy of this model is Bagging Decision Tree {} %.'.format(bagging_accuracy2))
print('The accuracy of this model is Bagging kNN {} %.'.format(bagging_accuracy3))
The accuracy of this model is Bagging Naive Bayes 100.0 %.
The accuracy of this model is Bagging Decision Tree 100.0 %.
The accuracy of this model is Bagging kNN 99.63 %.
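Because bagging fits each estimator on a bootstrap sample, the rows left out of each sample provide a free validation estimate; a sketch using scikit-learn's out-of-bag option (an addition, not part of the original run):

# oob_score=True scores each row using only the estimators that never saw it
clf_oob = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                            n_estimators=10, oob_score=True,
                            random_state=0).fit(X_trainn_min_max, y_trainn_min_max)
print('Out-of-bag accuracy:', clf_oob.oob_score_)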

UAS (Final Exam)#

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/credit_score.csv")
df
Unnamed: 0 kode_kontrak pendapatan_setahun_juta kpr_aktif durasi_pinjaman_bulan jumlah_tanggungan rata_rata_overdue risk_rating
0 1 AGR-000001 295 YA 48 5 61 - 90 days 4
1 2 AGR-000011 271 YA 36 5 61 - 90 days 4
2 3 AGR-000030 159 TIDAK 12 0 0 - 30 days 1
3 4 AGR-000043 210 YA 12 3 46 - 60 days 3
4 5 AGR-000049 165 TIDAK 36 0 31 - 45 days 2
... ... ... ... ... ... ... ... ...
895 896 AGR-010739 112 YA 48 5 > 90 days 5
896 897 AGR-010744 120 YA 48 2 46 - 60 days 3
897 898 AGR-010758 166 TIDAK 24 2 0 - 30 days 1
898 899 AGR-010775 196 TIDAK 48 0 31 - 45 days 2
899 900 AGR-010790 204 TIDAK 24 2 0 - 30 days 1

900 rows × 8 columns

Data Exploration

df[["kode_kontrak", "pendapatan_setahun_juta", "kpr_aktif", "durasi_pinjaman_bulan", "jumlah_tanggungan", "rata_rata_overdue", "risk_rating"]].agg(['min','max'])
kode_kontrak pendapatan_setahun_juta kpr_aktif durasi_pinjaman_bulan jumlah_tanggungan rata_rata_overdue risk_rating
min AGR-000001 70 TIDAK 12 0 0 - 30 days 1
max AGR-010790 300 YA 48 6 > 90 days 5
df.shape
(900, 8)

Preprocessing Data#

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Note: 'y' is never defined in this section, so this cell encodes whatever
# 'y' was left over from an earlier run -- the 150-element 0/1/2 array below
# does not come from the credit dataset (see the corrected sketch further down)
le.fit(y)
y = le.transform(y)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
le.inverse_transform(y)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
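As noted in the comment above, a sketch of what this cell was presumably meant to encode, namely the risk_rating column of the credit dataframe loaded just before:

# Encode the credit risk_rating labels to 0..n-1
y = df['risk_rating']
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)
print(le.classes_)      # the original labels
print(y_encoded[:5])    # the first five encoded labels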

Normalize Data#

numeric = pd.DataFrame(df, columns = ['kode_kontrak','pendapatan_setahun_juta','durasi_pinjaman_bulan','jumlah_tanggungan','risk_rating'])
numeric.head()
kode_kontrak pendapatan_setahun_juta durasi_pinjaman_bulan jumlah_tanggungan risk_rating
0 AGR-000001 295 48 5 4
1 AGR-000011 271 36 5 4
2 AGR-000030 159 12 0 1
3 AGR-000043 210 12 3 3
4 AGR-000049 165 36 0 2

My Final Exam (UAS)

READ DATA

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/Rosita19/datamining/main/healthcare-dataset-stroke-data.csv")
df
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
... ... ... ... ... ... ... ... ... ... ... ... ...
5105 18234 Female 80.0 1 0 Yes Private Urban 83.75 NaN never smoked 0
5106 44873 Female 81.0 0 0 Yes Self-employed Urban 125.20 40.0 never smoked 0
5107 19723 Female 35.0 0 0 Yes Self-employed Rural 82.99 30.6 never smoked 0
5108 37544 Male 51.0 0 0 Yes Private Rural 166.29 25.6 formerly smoked 0
5109 44679 Female 44.0 0 0 Yes Govt_job Urban 85.28 26.2 Unknown 0

5110 rows × 12 columns

Data Exploration


df[["gender", "age", "hypertension", "heart_disease", "ever_married", "work_type", "Residence_type", "avg_glucose_level", "bmi", "smoking_status"]].agg(['min','max'])
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status
min Female 0.08 0 0 No Govt_job Rural 55.12 10.3 Unknown
max Other 82.00 1 1 Yes children Urban 271.74 97.6 smokes
df.stroke.value_counts()
0    4861
1     249
Name: stroke, dtype: int64

Preprocessing Data

df = df.drop(columns="id")
X = df.drop(columns="stroke")
y = df.stroke
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)
y
array([1, 1, 1, ..., 0, 0, 0])
le.inverse_transform(y)
array([1, 1, 1, ..., 0, 0, 0])
labels = pd.get_dummies(df.stroke).columns.values.tolist()
labels
[0, 1]

Data Normalization

dataubah=df.drop(columns=['gender','ever_married','work_type','Residence_type','smoking_status'])
data_gen=df[['gender']]
gen = pd.get_dummies(data_gen)
data_married=df[['ever_married']]
married = pd.get_dummies(data_married)
data_work=df[['work_type']]
work = pd.get_dummies(data_work)
data_residence=df[['Residence_type']]
residence = pd.get_dummies(data_residence)
data_smoke=df[['smoking_status']]
smoke = pd.get_dummies(data_smoke)
data_bmi = df[['bmi']]
bmi = pd.get_dummies(data_bmi)
dataOlah = pd.concat([gen,married,work,residence,smoke,bmi], axis=1)
dataHasil = pd.concat([df,dataOlah], axis = 1)
dataHasil
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status ... work_type_Private work_type_Self-employed work_type_children Residence_type_Rural Residence_type_Urban smoking_status_Unknown smoking_status_formerly smoked smoking_status_never smoked smoking_status_smokes bmi
0 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked ... 1 0 0 0 1 0 1 0 0 36.6
1 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked ... 0 1 0 1 0 0 0 1 0 NaN
2 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked ... 1 0 0 1 0 0 0 1 0 32.5
3 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes ... 1 0 0 0 1 0 0 0 1 34.4
4 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked ... 0 1 0 1 0 0 0 1 0 24.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5105 Female 80.0 1 0 Yes Private Urban 83.75 NaN never smoked ... 1 0 0 0 1 0 0 1 0 NaN
5106 Female 81.0 0 0 Yes Self-employed Urban 125.20 40.0 never smoked ... 0 1 0 0 1 0 0 1 0 40.0
5107 Female 35.0 0 0 Yes Self-employed Rural 82.99 30.6 never smoked ... 0 1 0 1 0 0 0 1 0 30.6
5108 Male 51.0 0 0 Yes Private Rural 166.29 25.6 formerly smoked ... 1 0 0 1 0 0 1 0 0 25.6
5109 Female 44.0 0 0 Yes Govt_job Urban 85.28 26.2 Unknown ... 0 0 0 0 1 1 0 0 0 26.2

5110 rows × 28 columns

# Note: the target column 'stroke' is NOT dropped here, so it leaks into the
# features (it is visible in the X listing below); this is why the Naive Bayes
# and decision tree scores further down reach 1.0 -- see the corrected sketch
# after the X listing
X = dataHasil.drop(columns=["gender","ever_married","work_type","Residence_type","smoking_status","bmi"])
y = dataHasil.stroke
X
age hypertension heart_disease avg_glucose_level stroke gender_Female gender_Male gender_Other ever_married_No ever_married_Yes ... work_type_Never_worked work_type_Private work_type_Self-employed work_type_children Residence_type_Rural Residence_type_Urban smoking_status_Unknown smoking_status_formerly smoked smoking_status_never smoked smoking_status_smokes
0 67.0 0 1 228.69 1 0 1 0 0 1 ... 0 1 0 0 0 1 0 1 0 0
1 61.0 0 0 202.21 1 1 0 0 0 1 ... 0 0 1 0 1 0 0 0 1 0
2 80.0 0 1 105.92 1 0 1 0 0 1 ... 0 1 0 0 1 0 0 0 1 0
3 49.0 0 0 171.23 1 1 0 0 0 1 ... 0 1 0 0 0 1 0 0 0 1
4 79.0 1 0 174.12 1 1 0 0 0 1 ... 0 0 1 0 1 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5105 80.0 1 0 83.75 0 1 0 0 0 1 ... 0 1 0 0 0 1 0 0 1 0
5106 81.0 0 0 125.20 0 1 0 0 0 1 ... 0 0 1 0 0 1 0 0 1 0
5107 35.0 0 0 82.99 0 1 0 0 0 1 ... 0 0 1 0 1 0 0 0 1 0
5108 51.0 0 0 166.29 0 0 1 0 0 1 ... 0 1 0 0 1 0 0 1 0 0
5109 44.0 0 0 85.28 0 1 0 0 0 1 ... 0 0 0 0 0 1 1 0 0 0

5110 rows × 21 columns
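As flagged above, a sketch of the feature matrix with the target removed as well, so it cannot leak:

# Drop the target alongside the raw categorical columns
X = dataHasil.drop(columns=["gender","ever_married","work_type",
                            "Residence_type","smoking_status","bmi","stroke"])
X.shape   # (5110, 20) instead of (5110, 21)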

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
X
array([[0.81689453, 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.74365234, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.97558594, 0.        , 1.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.42626953, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.62158203, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.53613281, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])
X.shape, y.shape
((5110, 21), (5110,))
le.inverse_transform(y)
array([1, 1, 1, ..., 0, 0, 0])
labels = pd.get_dummies(dataHasil.stroke).columns.values.tolist()
labels
[0, 1]


Split Data

# split the data into testing (20%) and training (80%) sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=4)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((4088, 21), (1022, 21), (4088,), (1022,))

MODEL

from sklearn.neighbors import KNeighborsClassifier
from numpy import array

KNN

metode1 = KNeighborsClassifier(n_neighbors=3)
metode1.fit(X_train, y_train)
print(metode1.score(X_train, y_train))
print(metode1.score(X_test, y_test))
y_pred = metode1.predict(scaler.transform(array([[50.0,0,1,105.92,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0]])))
le.inverse_transform(y_pred)[0]
0.9907045009784736
0.9784735812133072
0

Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

metode2 = GaussianNB()
metode2.fit(X_train, y_train)
print(metode2.score(X_train, y_train))
print(metode2.score(X_test, y_test))
# Note: unlike the KNN cell above, this sample (and the decision tree sample
# below) is not passed through scaler.transform first, even though the models
# were trained on scaled data
y_pred = metode2.predict(array([[50.0,0,1,105.92,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0]]))
le.inverse_transform(y_pred)[0]
1.0
1.0
0

Decision Tree

(For the KNN model above, k = 3 was chosen because it gave the highest score.)

from sklearn import tree

metode3 = tree.DecisionTreeClassifier(criterion="gini")
metode3.fit(X_train, y_train)
print(metode3.score(X_train, y_train))
print(metode3.score(X_test, y_test))
y_pred = metode3.predict(array([[50.0,0,1,105.92,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0]]))
le.inverse_transform(y_pred)[0]
1.0
1.0
0

Export

  1. Label Encoder

  2. Scaler

  3. Model

import joblib  # joblib is a top-level package; sklearn.utils.validation no longer re-exports it
# label encoder
joblib.dump(le, "le.save")

# scaler
joblib.dump(scaler, "scaler.save")

# models (metode1 is the KNN model, metode2 the Naive Bayes, metode3 the tree)
joblib.dump(metode1, "knn.joblib")
joblib.dump(metode2, "nb.joblib")
joblib.dump(metode3, "tree.joblib")
['tree.joblib']
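For completeness, a sketch of loading the saved artifacts back for inference (file names as written above):

import joblib
import numpy as np

# Reload encoder, scaler and the KNN model, then classify one sample
le = joblib.load("le.save")
scaler = joblib.load("scaler.save")
knn = joblib.load("knn.joblib")
sample = np.array([[50.0,0,1,105.92,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0]])
print(le.inverse_transform(knn.predict(scaler.transform(sample)))[0])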