ChawlaAvi / Daily-Dose-of-Data-Science

A collection of code snippets from the publication Daily Dose of Data Science on Substack: http://www.dailydoseofds.com/
768 stars 172 forks source link

When I use the code, the printed output is always the iris.data result, even though I changed the data. #5

Open liariel opened 1 year ago

liariel commented 1 year ago

Hi ChawlaAvi, when I use the code, the printed output is always the iris.data result, even though I changed the data.

import pandas as pd
import numpy as np
import interactive_decision_tree as idt  # local module
from sklearn.tree import DecisionTreeClassifier

# Load the experiment data; columns 0-30 are used as features and
# column 38 as the target.
data = pd.read_csv('/Users/lee/Desktop/data-xy015.csv')
X = data.iloc[:, 0:31]
y = data.iloc[:, 38]

# Fit a decision tree with default hyperparameters on the full dataset.
clf = DecisionTreeClassifier()
clf = clf.fit(X, y)

# Export the fitted tree as an interactive tree diagram.
idt.create_tree(tree_model=clf, X=X, target_names=np.unique(y), save_path='C:/Users/lee/Desktop/PY01/tree_template.html')

# Export the same fitted tree as a Sankey diagram.
idt.create_sankey(tree_model=clf, X=X, target_names=np.unique(y), save_path='C:/Users/lee/Desktop/PY01/sankey_template.html')

shreevaths1 commented 1 year ago

Have you used the iris dataset in the above code? Does /Users/lee/Desktop/data-xy015.csv refer to the iris dataset?

liariel commented 1 year ago

Hi shreevaths1, I didn't use the iris dataset. "/Users/lee/Desktop/data-xy015.csv" contains my experiment data. I also tried a custom decision tree model, but the printed output is still the iris.data result.

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import interactive_decision_tree as idt

class MyDecisionTree:
    """Small wrapper that builds and holds a scikit-learn decision tree.

    The estimator is stored on ``self.tree`` so that callers can hand the
    underlying model to other tools after fitting.
    """

    def __init__(self):
        # No estimator exists until fit() has been called.
        self.tree = None

    def fit(self, X, y):
        """Create a Gini-criterion tree with fixed hyperparameters and fit it.

        Args:
            X: Training features, forwarded unchanged to the estimator.
            y: Training labels, forwarded unchanged to the estimator.
        """
        self.tree = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=4, min_samples_leaf=0.05, max_leaf_nodes=50)
        self.tree.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.tree.predict(X)

def importdata(path='/Users/lee/Desktop/data-xy015.csv'):
    """Load a comma-separated dataset and print a short summary of it.

    Args:
        path: Location of the CSV file. The first row is read as the header.
            Defaults to the path used by the original script.

    Returns:
        The loaded pandas DataFrame.
    """
    balance_data = pd.read_csv(path, sep=',', header=0)
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Dataset: ", balance_data.head())
    return balance_data

def splitdataset(balance_data):
    """Separate features and target, min-max scale the features, and split.

    Columns 0-29 of ``balance_data`` are taken as features and column 38 as
    the target. 30% of the rows are held out for testing, using a fixed
    random seed.

    Args:
        balance_data: DataFrame as returned by ``pd.read_csv``.

    Returns:
        A tuple ``(X, Y, X_train, X_test, y_train, y_test)`` where ``X`` is
        the scaled full feature matrix and ``Y`` the full target column.
    """
    X = balance_data.values[:, 0:30]
    Y = balance_data.values[:, 38]
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # held-out rows influence the scaling range - confirm this is intended.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)
    return X, Y, X_train, X_test, y_train, y_test

def train_using_gini(X_train, X_test, y_train):
    """Fit a ``MyDecisionTree`` on the training data and return it.

    Args:
        X_train: Training features.
        X_test: Accepted for signature compatibility; not used here.
        y_train: Training labels.

    Returns:
        The fitted ``MyDecisionTree`` instance.
    """
    clf_gini = MyDecisionTree()
    clf_gini.fit(X_train, y_train)
    return clf_gini

def prediction(X_test, clf_object):
    """Predict labels for ``X_test``, print them, and return them.

    Args:
        X_test: Samples to classify.
        clf_object: Any object exposing a ``predict(X)`` method.

    Returns:
        Whatever ``clf_object.predict(X_test)`` returns.
    """
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred

def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (in percent), and class report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples.
    """
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))

def build_decision_tree(X, clf_object, save_path='/Users/lee/Desktop/tree_template1.html'):
    """Export the fitted tree as a Sankey diagram via ``idt.create_sankey``.

    Args:
        X: Feature matrix passed through to ``idt.create_sankey``.
        clf_object: Wrapper whose ``tree`` attribute holds the fitted
            estimator.
        save_path: Where the generated HTML is written. Defaults to the path
            used by the original script.
    """
    # NOTE(review): the label order presumably has to match the class order
    # of the fitted tree - confirm against the data's target values.
    class_names = ['low', 'high']
    idt.create_sankey(tree_model=clf_object.tree, X=X, target_names=class_names, save_path=save_path)

def main():
    """Run the pipeline: load, split, train, evaluate, and export the tree."""
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    print("Results Using Gini Index:")
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
    # The diagram is built from the full scaled feature matrix, not only the
    # training subset the tree was fitted on.
    build_decision_tree(X, clf_gini)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()