Darthvaderkenya / captsone

1 stars 0 forks source link

Dealing with text #1

Closed rodneyosodo closed 1 year ago

rodneyosodo commented 5 years ago

Try this on any column whose rows are strings, i.e. any column of dtype object.

from sklearn.feature_extraction.text import TfidfVectorizer; vectorizer = TfidfVectorizer(); tweets = vectorizer.fit_transform(data["tweets"])

for these ones:

created_date 114 non-null object agency 114 non-null object agency_name 114 non-null object descriptor 114 non-null object status 114 non-null object community_board 114 non-null object borough 114 non-null object open_data_channel_type 114 non-null object park_facility_name 114 non-null object park_borough 114 non-null object address_type 114 non-null object resolution_description 113 non-null object resolution_action_updated_date 114 non-null object closed_date 0 non-null object facility_type 0 non-null object taxi_company_borough 0 non-null object bridge_highway_direction 0 non-null object road_ramp 0 non-null object bridge_highway_segment 0 non-null object

Then, for the location columns, try this: from sklearn.feature_extraction.text import CountVectorizer; vectorizer = CountVectorizer(); tweets = vectorizer.fit_transform(data["tweets"])

location_type 114 non-null object incident_address 114 non-null object street_name 114 non-null object cross_street_1 0 non-null object cross_street_2 0 non-null object intersection_street_1 0 non-null object intersection_street_2 0 non-null object city 114 non-null object landmark 0 non-null object location 114 non-null object taxi_pick_up_location 0 non-null object

Also use the count vectorizer for this one: complaint_type 114 non-null object

rodneyosodo commented 5 years ago

you can use this example for count vectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Location-related text columns of `requests_df` to turn into bag-of-words counts.
location_columns = [
    'location_type',
    'incident_address',
    'street_name',
    'cross_street_1',
    'cross_street_2',
    'intersection_street_1',
    'intersection_street_2',
    'city',
    'landmark',
    'location',
    'taxi_pick_up_location',
]

# CountVectorizer.fit_transform expects ONE string per document, so the
# selected columns are joined row-wise into a single text field. Missing
# values become empty strings so they contribute no tokens.
location_text = (
    requests_df[location_columns]  # list selector -> multi-column frame
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

vectorizer = CountVectorizer()
location_based = vectorizer.fit_transform(location_text)
rodneyosodo commented 5 years ago

for training

import pickle

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Train several classifiers on the vectorized text and print their accuracy.
# `tweets` (feature matrix) and `target` (labels) must already be defined.
# Roughly one third of the rows are held out; the seed is fixed so the split
# is reproducible.
x_train, x_test, y_train, y_test = train_test_split(tweets, target, test_size=0.333, random_state=42)

BNB = BernoulliNB()
BNB.fit(x_train, y_train)
yhat = BNB.predict(x_test)
print("BernoulliNB")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, BNB.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Save the fitted classifier. The context manager closes the file even if
# pickling fails. The ../Pickle directory must already exist.
with open("../Pickle/BernoulliNB.pickle", "wb") as classifier_f:
    pickle.dump(BNB, classifier_f)

RForest = RandomForestClassifier()
RForest.fit(x_train, y_train)
yhat = RForest.predict(x_test)
print("RandomForestClassifier")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, RForest.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
with open("../Pickle/RandomForestClassifier.pickle", "wb") as classifier_f:
    pickle.dump(RForest, classifier_f)

# Original author's note: the SVC takes longer to train than the others.
LSVC = LinearSVC()
LSVC.fit(x_train, y_train)
yhat = LSVC.predict(x_test)
print("LinearSVC")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, LSVC.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Tree depth is capped at 3 levels.
DTree = DecisionTreeClassifier(max_depth=3)
DTree.fit(x_train, y_train)
yhat = DTree.predict(x_test)
print("DecisionTreeClassifier")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, DTree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
rodneyosodo commented 5 years ago

for tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

# Non-location text columns of `requests_df` to turn into TF-IDF features.
non_location_columns = [
    'agency',
    'agency_name',
    'descriptor',
    'status',
    'community_board',
    'borough',
    'open_data_channel_type',
    'park_facility_name',
    'park_borough',
    'address_type',
    'resolution_description',
    'resolution_action_updated_date',
    'closed_date',
    'facility_type',
    'taxi_company_borough',
    'bridge_highway_direction',
    'road_ramp',
    'bridge_highway_segment',
]

# TfidfVectorizer.fit_transform expects ONE string per document, so the
# selected columns are joined row-wise into a single text field. Missing
# values become empty strings so they contribute no tokens.
non_location_text = (
    requests_df[non_location_columns]  # list selector -> multi-column frame
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

vectorizer = TfidfVectorizer()
non_location_based = vectorizer.fit_transform(non_location_text)
rodneyosodo commented 5 years ago
# Columns of `full_df` to convert with pd.to_datetime.
# Empty as written, so the loop below does nothing until names are added.
col_parameters = []

for date_col in col_parameters:
    # Overwrite the column in place with its parsed datetime values.
    full_df[date_col] = pd.to_datetime(full_df[date_col])
rodneyosodo commented 5 years ago

# Print every column of train_df that test_df lacks, in train_df's column order.
missing_in_test = [name for name in train_df.columns if name not in test_df.columns]
for name in missing_in_test:
    print(name)
rodneyosodo commented 5 years ago
# Fit `subject_transformer` on the 'subject' column and keep the transformed output.
subject_docs = full_docs['subject']
subject_vectorized = subject_transformer.fit_transform(subject_docs)
rodneyosodo commented 5 years ago
# Convert the vectorizer output to a dense array (presumably it is a SciPy
# sparse matrix; confirm), then wrap it in a DataFrame with default labels.
dense_matrix = regex_vectorized.toarray()
regex_vectorized_df = pd.DataFrame(data=dense_matrix)
rodneyosodo commented 5 years ago
# Left-join rider attributes onto full_df: every full_df row is kept and
# matched on full_df['Rider_Id'] == riders_df['Rider Id'].
# `left_index=True` was removed because it contradicts `left_on` (join on the
# index vs. join on a column); current pandas rejects the combination.
# NOTE(review): the result gets a fresh integer index; if the original
# full_df index must be preserved, restore it explicitly after the merge.
full_df = pd.merge(
    full_df,
    riders_df,
    how='left',
    left_on='Rider_Id',
    right_on='Rider Id',
)