dinarmalik37 / tugas_akhhir

prediksi penjualan bbm di spbu
0 stars 0 forks source link

modelling #2

Open dinarmalik37 opened 5 hours ago

dinarmalik37 commented 5 hours ago

# Notebook setup: imports, Google Drive mount and data load.
# The original cell had every statement collapsed onto one line, which is
# not valid Python; each statement is restored to its own line.
from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive into the Colab runtime (force a fresh mount).
drive.mount('/content/drives/', force_remount=True)

# Load the Pertalite sales data set and preview the first rows.
data = pd.read_csv('/content/PERTALITE_data.csv')
data.head()

# Count missing values across the whole frame.
jumlah_missing_value = data.isnull().sum().sum()
print(f'Jumlah Missing Value: {jumlah_missing_value}')

# Count fully duplicated rows.
jumlah_duplikasi = data.duplicated().sum()
print(f'Jumlah Duplikasi Data: {jumlah_duplikasi}')

# Encode the product name as a number. Series.map leaves NaN for any
# product name that is not a key of this mapping.
Produk_to_num = {
    'PERTALITE': 0,
}
data['Produk'] = data['Produk'].map(Produk_to_num)

# Convert every column to float64, except 'Transaction Date'.
# In the collapsed original the inline comment swallowed the assignment,
# so the conversion never ran; the statements are separated again here.
for col in data.columns:
    if col != 'Transaction Date':  # skip the date column
        data[col] = data[col].astype('float64')

# Inspect the result of the conversion.
data.info()
print(data.head())
print(data.columns)
data.head()

from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Create the scaler object.
scaler = MinMaxScaler()

# Work on `data` directly. Only the non-numeric 'Transaction Date' column
# is dropped ('Produk' stays in); errors='ignore' keeps this safe when the
# column is already absent.
data_numeric = data.drop(['Transaction Date'], axis=1, errors='ignore')

# Fit and transform the numeric columns only. The original index is kept
# so the concat below aligns row by row even when `data` does not carry a
# default RangeIndex.
df_normalized_numeric = pd.DataFrame(
    scaler.fit_transform(data_numeric),
    columns=data_numeric.columns,
    index=data_numeric.index,
)

# Put 'Transaction Date' back next to the normalized columns.
df_normalized = pd.concat([data[['Transaction Date']], df_normalized_numeric], axis=1)

# Show the normalized data.
print(df_normalized.head())

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Parse 'Transaction Date' into datetime values.
data['Transaction Date'] = pd.to_datetime(data['Transaction Date'])

# Extract calendar features from 'Transaction Date'.
data['Year'] = data['Transaction Date'].dt.year
data['Month'] = data['Transaction Date'].dt.month
data['Day'] = data['Transaction Date'].dt.day

# Select the model columns. Take a copy so that the label encoding below
# writes to an independent frame rather than to a slice of `data`
# (avoids pandas' SettingWithCopyWarning).
X = data[['Year', 'Month', 'Day', 'Produk']].copy()
y = data['Penjualan_Real (kl)']

# Label-encode the 'Produk' column.
label_encoder = LabelEncoder()
X['Produk'] = label_encoder.fit_transform(X['Produk'])

# Split into train and test sets (30% test, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the numeric columns (Year, Month, Day, Produk).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Year', 'Month', 'Day', 'Produk']),
    ]
)

# Pipeline that chains preprocessing and the model.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])

# Train the model.
pipeline.fit(X_train, y_train)

# Predict on the test set.
y_pred = pipeline.predict(X_test)

# Collect the test features with actual and predicted sales for display.
results = X_test.copy()
results['Penjualan real (Actual)'] = y_test.values
results['Penjualan real (Predict)'] = y_pred

# Show the linear-regression prediction results.
# The model is created and fitted first: the original called
# lr.predict() before `lr` existed, which raises NameError on a fresh run.
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)  # Calculate predictions

# RMSE of the plain linear-regression predictions.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f'Root Mean Squared Error: {rmse}')

# Side-by-side table of actual and predicted values.
# NOTE: this rebinds `data`, which held the raw data set up to this point.
data = pd.DataFrame({'Actual': y_test, 'Predicted': pred})
data

dinarmalik37 commented 5 hours ago

# Fitted parameters of the linear model. scikit-learn exposes them with a
# trailing underscore (`coef_`, `intercept_`); `lr.coef` / `lr.intercept`
# raise AttributeError.
print(lr.coef_)
print(lr.intercept_)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# y_test and pred are expected to be equally sized arrays or lists.
print('R² Score = ', r2_score(y_test, pred))
print('MAE = ', mean_absolute_error(y_test, pred))  # Call the function directly after importing
print('RMSE = ', np.sqrt(mean_squared_error(y_test, pred)))

import matplotlib.pyplot as plt

# `results` (built in an earlier cell) holds the actual and predicted values.
actual = results['Penjualan real (Actual)']
predicted = results['Penjualan real (Predict)']

# Scatter plot of actual vs predicted values; the red line marks where a
# perfect prediction would fall (predicted == actual).
plt.figure(figsize=(10, 5))
plt.scatter(actual, predicted, color='blue', label='Predicted vs Actual')
plt.plot(actual, actual, color='red', linewidth=2, label='Perfect Prediction Line')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

# Per-product RMSE and MAE values.
data = {
    "Produk": ["Bio Solar", "Pertalite", "Pertamax Turbo", "Pertamax", "Pertamina Dex"],
    "RMSE": [4.93, 3.56, 0.24, 1.26, 0.23],
    "MAE": [4.19, 2.34, 0.16, 0.86, 0.18],
}

# Convert the decimal values to percentages. The multiplication operator
# was missing in the original (`x 100`), which is a syntax error.
data["RMSE"] = [value * 100 for value in data["RMSE"]]
data["MAE"] = [value * 100 for value in data["MAE"]]

# Show the converted values, one product per line.
for produk, rmse_pct, mae_pct in zip(data["Produk"], data["RMSE"], data["MAE"]):
    print(f"{produk} - RMSE: {rmse_pct:.2f}%, MAE: {mae_pct:.2f}%")

# Given MAPE values.
mape_values = [
    0.4407406724661195,
    345576011805501.1,
    102775132738909.39,
    68609354818462.65,
    73154835778935.28,
]

# Total of the MAPE values.
total_sum = sum(mape_values)

# Share of each MAPE value in the total, as a percentage.
percentages = [(value / total_sum) * 100 for value in mape_values]
percentages

import pandas as pd

# Assumes `data` is the DataFrame with an 'Actual' column at this point;
# the name `data` is rebound to other objects elsewhere in this notebook,
# so run this right after the cell that builds that frame.
# Check the available columns in the DataFrame.
print(data.columns)

# If 'Penjualan_Real (kl)' is not present, use the actual name of the
# target column instead.
target_column = 'Actual'  # Replace 'Actual' with the correct column name

# Mean of the target column. In the collapsed original this statement was
# swallowed by the inline comment above and never executed.
data_target = data[target_column].mean()
print(f'Rata-rata nilai target: {data_target}')

# Given values.
RMSE = 1.7084453018220232e-15
average_target_value = 0.4677528089887641

# RMSE expressed as a percentage of the average target value.
percentage_error = (RMSE / average_target_value) * 100
percentage_error

# Build a synthetic monthly series: one month-end date per month from
# January 2022 to September 2023, each with a random amount in [0, 3500).
# MonthEnd() is the offset behind the 'M' alias; passing the offset object
# works regardless of which alias spelling the installed pandas accepts.
dates = pd.date_range(start='2022-01-01', end='2023-09-30', freq=pd.offsets.MonthEnd())
data = np.random.rand(len(dates)) * 3500

# Build the DataFrame.
df = pd.DataFrame({
    'Date': dates,
    'Amount': data,
})

# Disabled because the frame has no 'Transaksi Data' column:
# df['Transaksi Data'] = pd.to_datetime(df['Transaksi Data'])

# Make sure the 'Date' column is in datetime format.
df['Date'] = pd.to_datetime(df['Date'])

# Keep the rows from 2022 up to and including September 2023.
start_date = '2022-01-01'
end_date = '2023-09-30'
filtered_df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

# Basic statistics: overall total and the mean amount per calendar month.
total_transactions = filtered_df['Amount'].sum()
average_per_month = filtered_df.groupby(filtered_df['Date'].dt.to_period('M'))['Amount'].mean()

print(f"Total Transactions: {total_transactions}")
print("Rata-rata per Bulan:")
print(average_per_month)

# Scatter plot of the filtered transaction amounts over time.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Date', y='Amount', data=filtered_df)

# Disabled because `df` has no 'Prediction' column:
# sns.lineplot(x='Date', y='Prediction', data=df, label='Prediction', color='red')
# If the predictions live in another variable (for example `predictions`),
# plot that instead:
# sns.lineplot(x='Date', y=predictions, data=df, label='Prediction', color='red')

plt.title('Scatter Plot untuk Transaksi Penjualan Pertalite dari Tahun 2022 hingga September 2023')
plt.xlabel('Date')
plt.ylabel('Amount')
plt.grid(True)
plt.show()

import matplotlib.pyplot as plt

# `data` from the previous cells is a NumPy array, so it is used directly
# as the actual values.
actual = data  # If 'data' contains only actual values

# If `data` held both actual and predicted values, the indexing would have
# to change, e.g. actual values in the first half and predicted in the
# second:
# actual = data[:len(data)//2]
# predicted = data[len(data)//2:]

# Placeholder for predicted values - replace with your actual predicted values
predicted = np.random.rand(len(actual))

print(f"Total Transactions: {total_transactions}")
print("Rata-rata per Bulan:")
print(average_per_month)

# Scatter plot of actual vs predicted values; the red line marks where a
# perfect prediction would fall (predicted == actual).
plt.figure(figsize=(10, 5))
plt.scatter(actual, predicted, color='blue', label='Predicted vs Actual')
plt.plot(actual, actual, color='red', linewidth=2, label='Perfect Prediction Line')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

# Mean of the linear-regression predictions.
average_prediction = pred.mean()
print(f'Rata-rata nilai prediksi: {average_prediction}')

import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the results table from the pipeline predictions.
results = X_test.copy()
results['Penjualan real (Actual)'] = y_test.values
results['Penjualan real (Predict)'] = y_pred

# Create the figure.
plt.figure(figsize=(10, 6))

# Actual vs predicted sales.
sns.scatterplot(x='Penjualan real (Actual)', y='Penjualan real (Predict)', data=results, label='Data Points')

# Ideal-prediction reference line (predicted == actual), drawn across the
# range of the actual values.
plt.plot(
    [results['Penjualan real (Actual)'].min(), results['Penjualan real (Actual)'].max()],
    [results['Penjualan real (Actual)'].min(), results['Penjualan real (Actual)'].max()],
    color='red',
    lw=2,
    label='Ideal Prediction',
)

plt.title('Actual vs Predicted Penjualan Real')
plt.xlabel('Penjualan Real (Actual)')
plt.ylabel('Penjualan Real (Predicted)')
plt.legend()
plt.grid(True)
plt.show()

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# y_test and pred are expected to be equally sized arrays or lists.
print('R² Score = ', r2_score(y_test, pred))
print('MAE = ', mean_absolute_error(y_test, pred))  # Call the function directly after importing
# In the collapsed original the inline comment above swallowed this line.
print('RMSE = ', np.sqrt(mean_squared_error(y_test, pred)))

import matplotlib.pyplot as plt

# `data` from the previous cells is a NumPy array, so it is used directly
# as the actual values.
actual = data  # If 'data' contains only actual values

# If `data` held both actual and predicted values, the indexing would have
# to change, e.g. actual values in the first half and predicted in the
# second:
# actual = data[:len(data)//2]
# predicted = data[len(data)//2:]

# Placeholder for predicted values - replace with your actual predicted values
predicted = np.random.rand(len(actual))

# Scatter plot of actual vs predicted values; the red line marks where a
# perfect prediction would fall (predicted == actual).
plt.figure(figsize=(10, 5))
plt.scatter(actual, predicted, color='blue', label='Predicted vs Actual')
plt.plot(actual, actual, color='red', linewidth=2, label='Perfect Prediction Line')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()

# Mean of the placeholder predictions (`predicted` is used here rather
# than `pred`).
average_prediction = predicted.mean()
print(f'Rata-rata nilai prediksi: {average_prediction}')

from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Assumes the data set is loaded in the DataFrame `data`.
data_numeric = data.drop(['Transaction Date'], axis=1, errors='ignore')

# Minimum and maximum of each numeric column (kept for inspection; the
# scaler learns its own min/max in fit()).
min_values = data_numeric.min()
max_values = data_numeric.max()

# Fit a scaler that maps each column onto the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data_numeric)

# Normalize the value 17.84 for every numeric column: a single row with
# 17.84 in each column.
input_values = [[17.84] * len(data_numeric.columns)]

# Wrap the row in a frame with the fitted column names so scikit-learn
# does not warn about missing feature names; the numbers are unchanged.
input_frame = pd.DataFrame(input_values, columns=data_numeric.columns)
normalized_values = scaler.transform(input_frame)
print(normalized_values)

# Numeric value.
data = 3.5591632235523107

# Convert to a percentage.
percentage = data * 100

# Round the result to the nearest integer.
rounded_percentage = round(percentage)

print(f"{data} sebagai persentase adalah {rounded_percentage}%")

# Sum the given values (the original note says this was meant to check
# the result against 1.746083705834967e-16).
data = [
    4.927935266767451,
    3.5591632235523107,
    0.23995953264050024,
    1.2564941305530912,
    0.23080689494857123,
]

jumlah = sum(data)
jumlah

import numpy as np

# Given values.
values = np.array([4.93, 3.56, 0.24, 1.26, 0.23])

# Root of the mean of the squared values.
rmse = np.sqrt(np.mean(values**2))
rmse

# Given values.
values_mae = np.array([4.19, 2.34, 0.16, 0.86, 0.18])

# Mean of the absolute values.
mae = np.mean(np.abs(values_mae))
mae