import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
# Load the Superstore sample and run a first quality check on it.
# NOTE: the bare expressions below only display a result when each one is the
# last statement of a notebook cell; run as a plain script they are no-ops.
data = pd.read_csv("SampleSuperstore.csv")
data
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')
data.isnull().sum()      # missing values per column
data.duplicated().sum()  # count of fully duplicated rows
data.nunique()           # distinct values per column

# Working copy without the 'Postal Code' column.
col = ['Postal Code']
data1 = data.drop(columns=col)

# Correlation and covariance are only defined for numeric columns. Selecting
# them explicitly keeps this working on pandas >= 2.0, where corr()/cov() no
# longer drop non-numeric columns silently and raise instead.
numeric_data = data1.select_dtypes(include='number')
numeric_data.corr()
numeric_data.cov()
# Bar chart with 'Sub-Category' on the x axis and 'Category' as bar height,
# both columns being looked up in data1 through the data= keyword.
plt.figure(figsize=(16, 8))
plt.bar('Sub-Category', 'Category', data=data1)
plt.title('Category vs Sub Category')
plt.xlabel('Sub-Category')  # label previously misspelled as 'Sub-Catgory'
plt.ylabel('Category')
plt.xticks(rotation=45)
plt.show()

# One 50-bin histogram per column that DataFrame.hist can plot.
data1.hist(bins=50, figsize=(20, 15))
plt.show()
# Row count per state: first as a table, then as a count plot.
data1['State'].value_counts()

plt.figure(figsize=(15, 15))
sns.countplot(x=data1['State'])
plt.title("STATE")
plt.xticks(rotation=90)  # rotate the state names on the x axis by 90 degrees
plt.show()
# Bar plot of 'Profit' for each 'Sub-Category' (seaborn aggregates the rows
# of every sub-category into a single bar).
sns.set(style="whitegrid")
plt.figure(2, figsize=(20, 15))
sns.barplot(x='Sub-Category', y='Profit', data=data, palette='Spectral')
# The previous title ('Pie Consumption Patterns in the United States') was a
# copy-paste leftover that did not describe this chart.
plt.suptitle('Profit by Sub-Category', fontsize=16)
plt.show()
# NOTE(review): this assignment is never passed to any call in the visible
# code, so it does not influence the pair plot; kept only in case other code
# reads the name.
figsize = (15, 10)

# Pairwise relationships between the columns of data1, coloured by
# 'Sub-Category'.
sns.pairplot(data1, hue='Sub-Category')
plt.show()  # was `plt.show` without parentheses, which never called it
# Line plot of 'Profit' against 'Discount'.
plt.figure(figsize=(10, 4))
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional data
# variables with a TypeError.
sns.lineplot(x='Discount', y='Profit', data=data1, color='y', label='Discount')
plt.legend()
plt.show()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Full state name -> two-letter postal abbreviation, as required by plotly's
# 'USA-states' location mode.
state_code = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME',
    'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA',
    # Was 'WA', which silently added DC's figures to Washington state.
    'District of Columbia': 'DC',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Direct dict indexing is deliberate: a state missing from the table raises
# KeyError instead of being dropped from the totals unnoticed.
data1['state_code'] = data1['State'].apply(lambda name: state_code[name])

# Total Sales and Profit per state abbreviation.
state_data = data1[['Sales', 'Profit', 'state_code']].groupby(['state_code']).sum()

# Choropleth of total sales per state.
fig = go.Figure(data=go.Choropleth(
    locations=state_data.index,
    z=state_data.Sales,
    locationmode='USA-states',
    colorscale='Reds',
    colorbar_title='Sales in USD',
))
fig.update_layout(
    title_text='Total State-Wise Sales',
    geo_scope='usa',
    height=800,
)
fig.show()
def state_data_viewer(states):
    """Plot total profit per sub-category, by product category, for each state.

    For every state one figure with three side-by-side panels is drawn
    (Furniture, Office Supplies, Technology). Each panel is a horizontal bar
    chart of the summed 'Profit' of that category's sub-categories within the
    state. A category with no rows for the state gets an empty, labelled
    panel instead of aborting the whole run.

    Args:
        states: Iterable of state names (values of the 'State' column of the
            module-level ``data1`` frame) to plot.

    Returns:
        None

    Raises:
        KeyError: If a requested state does not occur in ``data1``.
    """
    categories = ('Furniture', 'Office Supplies', 'Technology')
    # Group by a scalar key: with a one-element list key, pandas >= 2.2
    # expects get_group() to receive a tuple rather than a plain string.
    rows_by_state = data1.groupby('State')
    for state in states:
        state_rows = rows_by_state.get_group(state)
        fig, axes = plt.subplots(1, len(categories), figsize=(28, 5))
        fig.suptitle(state, fontsize=14)
        for ax, cat in zip(axes, categories):
            cat_rows = state_rows[state_rows['Category'] == cat]
            # Only 'Profit' is plotted, so sum just that column rather than
            # every column of the frame (which includes text columns).
            profit = cat_rows.groupby('Sub-Category')['Profit'].sum()
            if not profit.empty:
                sns.barplot(x=profit, y=profit.index, ax=ax)
            # Set after plotting: barplot writes its own y label first.
            ax.set_ylabel(cat)
        fig.show()


states = ['California', 'Washington', 'Mississippi', 'Arizona', 'Texas']
state_data_viewer(states)
# Feature matrix for clustering: columns 9-12 of the raw frame.
# NOTE(review): presumably Sales, Quantity, Discount, Profit -- the names are
# read from the frame below so the plots cannot drift from the selection.
feature_cols = list(data.columns[[9, 10, 11, 12]])
x = data.iloc[:, [9, 10, 11, 12]].values
from sklearn.cluster import KMeans

# Fit k-means for k = 1..10 and record the within-cluster sum of squares.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++',
                    max_iter=300, n_init=10, random_state=0).fit(x)
    wcss.append(kmeans.inertia_)
# NOTE(review): `wcss` is not used anywhere in the visible code, and `kmeans`
# is now the last model fitted (n_clusters=10); its centroids are the ones
# drawn below. Confirm that 10 clusters is the intended choice.

# Scatter Sales against Quantity and against Profit, coloured by
# sub-category, with the centroids overlaid. The centroid coordinates are
# looked up by feature name: the Profit plot previously reused centroid
# column 1 (the second selected feature) for its y values.
sns.set_style("whitegrid")
sales_idx = feature_cols.index('Sales')
for y_col in ('Quantity', 'Profit'):
    y_idx = feature_cols.index(y_col)
    sns.FacetGrid(data, hue="Sub-Category", height=6).map(plt.scatter, 'Sales', y_col)
    plt.scatter(kmeans.cluster_centers_[:, sales_idx],
                kmeans.cluster_centers_[:, y_idx],
                s=100, c='yellow', label='Centroids')
    plt.legend()
    plt.show()
# Scatter of the 'Profit' column against the 'Sales' column of data1 on a
# single 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data1["Sales"], data1["Profit"])
ax.set(xlabel='Sales', ylabel='Profit', title='Sales vs Profit')
plt.show()