Closed leopensaa closed 2 years ago
Advances on the data processing for this chart
# Reviews
reviews_df = pd.read_csv('datasets/order_reviews.csv')
orders_df = pd.read_csv('datasets/orders.csv')
customers_df = pd.read_csv('datasets/customers.csv')
geolocation_df = pd.read_csv('datasets/geolocation.csv')
df_comments = reviews_df.loc[:, ['review_score', 'review_comment_message']]
df_comments = df_comments.dropna(subset=['review_comment_message'])
df_comments = df_comments.reset_index(drop=True)
print(f'Dataset shape: {df_comments.shape}')
df_comments.columns = ['score', 'comment']
df_comments.head()
geolocation_df = geolocation_df.rename(columns={'geolocation_zip_code_prefix':'customer_zip_code_prefix'})
# create a new dataset with the customers geolocation_lat and geolocation_long
customers_geolocation = customers_df.merge(geolocation_df, how='left', on='customer_zip_code_prefix')
# clean the customer_df duplicates customer_unique_id
customers_geolocation = customers_geolocation.drop_duplicates(subset=['customer_unique_id'], keep='first')
# merge customers_geolocation and orders_df
customers_geolocation_orders = orders_df.merge(customers_geolocation, how='left', on='customer_id')
# mergue customers_geolocation_orders with reviews_df
customers__orders_reviews = customers_geolocation_orders.merge(reviews_df, how='left', on='order_id')
# map the review_score to a numerical value
score_map = {
1: 'negative',
2: 'negative',
3: 'positive',
4: 'positive',
5: 'positive'
}
customers__orders_reviews['sentiment_label'] = customers__orders_reviews['review_score'].map(score_map)
# drop customers__orders_reviews with zip_code_prefix = 0 or nan
customers__orders_reviews = customers__orders_reviews[customers__orders_reviews['customer_zip_code_prefix'] != 0]
customers__orders_reviews = customers__orders_reviews[customers__orders_reviews['customer_zip_code_prefix'].notnull()]
#drop nan reviews_score
customers__orders_reviews = customers__orders_reviews[customers__orders_reviews['review_score'].notnull()]
customers__orders_reviews
#keeping only the columns required for the graph
customers__orders_reviews = customers__orders_reviews[['order_status', 'customer_unique_id', 'customer_city',
'customer_state', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state',
'review_id', 'review_score', 'sentiment_label']]
customers__orders_reviews
visualization with px scatter_geo
# create a scatter_geo plot focused on brazil with the geolocation_lat and geolocation_long
fig = px.scatter_geo(customers__orders_reviews, lat="geolocation_lat", lon="geolocation_lng",
color="sentiment_label", color_discrete_map={'positive': 'green', 'negative': 'red'},
hover_name="customer_unique_id",
hover_data=["customer_city", "customer_state", "review_score"])
fig.update_layout(title_text="Reviews by customers")
fig.show()
i'm still working on a better way to visualize the second AC
Improvement of the graph, approaching with a heatmap to representate the accumulation of orders at the geolocation and using color gradients for the representative reviews.
# mapbox density heatmap of the reviews by customers with the geolocation_lat and geolocation_long and the review_score as the color
fig = px.density_mapbox(customers__orders_reviews, lat="geolocation_lat", lon="geolocation_lng",
z="review_score", radius=3, mapbox_style="open-street-map",
range_color=(0, 5),
color_continuous_scale = px.colors.diverging.RdYlGn
,zoom=2, center={"lat": -10.0, "lon": -51.0})
fig.update_layout(title_text="Reviews by customers")
fig.show()
now we have a trustworthy visualization for our goal.
Cool improvement! You almost have it @felipesaldata ⚡ !!
Second graph: Done
# group customers__orders_reviews by state aggregation of the count of reviews and review_score mean
customers__orders_reviews_state = customers__orders_reviews.groupby(['customer_state'])['review_score'].agg(['count', 'mean'])
#rename column to merge the coordinates
geolocation_df.rename(columns={'geolocation_state':'customer_state'}, inplace=True)
# add the cordinates of the state to the customers__orders_reviews_state
customers__orders_reviews_state = customers__orders_reviews_state.merge(geolocation_df, how='left', on='customer_state')
#drop duplicates
customers__orders_reviews_state = customers__orders_reviews_state.drop_duplicates(subset=['customer_state'], keep='first')
#keeping only the columns required for the graph
customers__orders_reviews_state = customers__orders_reviews_state[['customer_state','count','mean','geolocation_lat','geolocation_lng']]
#reset index
customers__orders_reviews_state = customers__orders_reviews_state.reset_index(drop=True)
#Some of the default values overlaps on the graph so we need to replace them with more propel values
#replace cordinates of RN with lat= -5.83 and lng= -35.21
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'RN', 'geolocation_lat'] = -5.833
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'RN', 'geolocation_lng'] = -35.21
#replace cordinates of AC with lat= -8.63 and lng= -69.78
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'AC', 'geolocation_lat'] = -8.63
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'AC', 'geolocation_lng'] = -69.78
#replace cordinates of GO with lat= -15.78 and lng= -50.69
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'GO', 'geolocation_lat'] = -15.78
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'GO', 'geolocation_lng'] = -50.69
#replaces cordinates of PA with lat= -5.93 and lng= -52.27
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'PA', 'geolocation_lat'] = -5.93
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'PA', 'geolocation_lng'] = -52.27
#replace cordinates of BA with lat= -12.29 and lng= -42.34
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'BA', 'geolocation_lat'] = -12.29
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'BA', 'geolocation_lng'] = -42.34
#replace cordinates of PE with lat= -8.59 and lng= -38.34
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'PE', 'geolocation_lat'] = -8.59
customers__orders_reviews_state.loc[customers__orders_reviews_state['customer_state'] == 'PE', 'geolocation_lng'] = -38.34
#plot bubble chart of customers__orders_reviews_state
fig = px.scatter_geo(customers__orders_reviews_state, lat="geolocation_lat", lon="geolocation_lng",
color="mean", size="count",size_max=50,
range_color=(2, 5),
color_continuous_scale = px.colors.diverging.RdYlGn,
hover_name="customer_state",
hover_data=["count", "mean"],
text = "count",
center={"lat": -10.0, "lon": -51.0},
scope="south america",
template="plotly_dark")
fig.update_layout(title_text="Reviews by customers")
fig.show()
Another approach for the second graph with choropleth
import json
# open brazil_states.json
with open('datasets/brazil-states.geojson') as json_file:
brazil_geo = json.load(json_file)
# creates a dict with the correspinding id with the sigla
state_id_map = {}
for state in brazil_geo['features']:
state_id_map[state['properties']['sigla']] = state['properties']['id']
# add regiao_id to the customers__orders_reviews_state
customers__orders_reviews_state['regiao_id'] = customers__orders_reviews_state['customer_state'].map(state_id_map)
# creates a choropleth map using the lat_lng column and state_id_map
fig = px.choropleth(customers__orders_reviews_state, geojson=brazil_geo, locations="customer_state",
color="mean",
color_continuous_scale=px.colors.diverging.RdYlGn,
hover_name="customer_state",
hover_data=["count", "mean"], featureidkey="properties.sigla",
scope="south america",
template="plotly_dark")
fig.update_geos(fitbounds="locations")
fig.show()
I'm wondering if the one with bubbles could be merged with the last one somehow. It would be great beacause we need just one of them for the dashboard. Thank you @felipesaldata. Closing this Issue.
💡 Goal
Show a map with a general ranking of customer satisfaction in Brazil to identify better and worse user experiences.
🤝 Acceptance Criteria
See the Notion reference with pictures as example