import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
# Load the three-column CSV and give the columns short working names.
# NOTE(review): read_csv treats the first row of the file as a header, while
# the np.loadtxt call further down reads the same file without skipping any
# row. If the file has no header row, the first record is lost here --
# confirm against the raw file.
df = pd.read_csv("data-no-id.csv", encoding="iso-8859-1")
df.columns = list("xyz")
df.head()
x | y | z | |
---|---|---|---|
0 | 0.02 | 0.00 | 0.00 |
1 | 0.04 | 0.00 | 0.20 |
2 | 0.06 | 0.03 | 0.33 |
3 | 0.00 | 0.03 | 0.33 |
4 | 0.02 | 0.06 | 0.00 |
# Display the column labels to confirm the rename to x / y / z took effect.
df.columns
Index(['x', 'y', 'z'], dtype='object')
# Frequency of each distinct value in column x, most common first.
df["x"].value_counts()
0.02 8 0.06 7 0.04 5 0.51 5 0.00 3 0.13 3 0.09 3 0.17 2 0.11 2 0.28 2 0.53 2 0.55 1 0.62 1 0.36 1 0.15 1 0.91 1 0.87 1 0.72 1 0.19 1 0.21 1 0.98 1 1.00 1 Name: x, dtype: int64
# Frequency of each distinct value in column y, most common first.
df["y"].value_counts()
0.55 6 0.03 6 0.61 6 0.06 4 0.10 3 0.13 3 0.00 3 0.52 3 0.19 3 0.35 2 0.39 2 0.45 2 0.16 2 0.94 1 1.03 1 0.65 1 0.74 1 0.77 1 1.10 1 0.58 1 0.48 1 Name: y, dtype: int64
# Frequency of each distinct value in column z, most common first.
df["z"].value_counts()
0.13 9 0.07 8 0.00 6 0.40 5 1.00 5 0.73 4 0.93 2 0.20 2 0.33 2 0.67 2 0.47 2 0.60 2 0.87 1 0.53 1 0.27 1 0.80 1 Name: z, dtype: int64
import seaborn as sns
# Density-normalised histogram of x with a KDE curve on top.
# `histplot` replaces the deprecated `distplot` (scheduled for removal from
# seaborn); the KDE is still clipped to the [0, 1] range.
sns.histplot(df['x'], kde=True, stat='density', kde_kws={'clip': (0.0, 1.0)})
/usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='x', ylabel='Density'>
import seaborn as sns
# Switch to seaborn's "dark" palette for the plots that follow, then draw a
# plain count histogram of x.
sns.set_palette("dark")
sns.histplot(data=df, x="x")
<AxesSubplot:xlabel='x', ylabel='Count'>
import seaborn as sns
# Density-normalised histogram of y with a KDE curve on top.
# `histplot` replaces the deprecated `distplot` (scheduled for removal from
# seaborn); the KDE is still clipped to the [0, 1] range.
sns.histplot(df['y'], kde=True, stat='density', kde_kws={'clip': (0.0, 1.0)})
/usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='y', ylabel='Density'>
import seaborn as sns
# Plain count histogram of y.
sns.histplot(data=df, x="y")
<AxesSubplot:xlabel='y', ylabel='Count'>
import seaborn as sns
# Density-normalised histogram of z with a KDE curve on top.
# `histplot` replaces the deprecated `distplot` (scheduled for removal from
# seaborn); the KDE is still clipped to the [0, 1] range.
sns.histplot(df['z'], kde=True, stat='density', kde_kws={'clip': (0.0, 1.0)})
/usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='z', ylabel='Density'>
import seaborn as sns
# Plain count histogram of z.
sns.histplot(data=df, x="z")
<AxesSubplot:xlabel='z', ylabel='Count'>
# 3-D scatter of the three columns, one point per CSV row.
fig = plt.figure()
ax = fig.add_subplot(projection="3d")
# Read the file straight into NumPy; transposing the (rows, 3) array yields
# one 1-D array per column.
x, y, z = np.loadtxt("data-no-id.csv", delimiter=",").T
ax.scatter(x, y, z)
plt.show()
The main goal here is to explore the following relationships:
# Overlay the x (green) and y (purple) distributions on the same axes.
# `histplot` replaces the deprecated `distplot`; alpha=0.4 keeps both
# histograms visible where they overlap, and the KDE stays clipped to [0, 1].
sns.histplot(
    df['x'],
    kde=True,
    stat='density',
    kde_kws={'clip': (0.0, 1.0)},
    color="seagreen",
    alpha=0.4,
)
sns.histplot(
    df['y'],
    kde=True,
    stat='density',
    kde_kws={'clip': (0.0, 1.0)},
    color="purple",
    alpha=0.4,
)
/usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='y', ylabel='Density'>
Comparing X and Y, we gathered the following insights:
# Overlay the y (green) and z (purple) distributions on the same axes.
# `histplot` replaces the deprecated `distplot`; alpha=0.4 keeps both
# histograms visible where they overlap, and the KDE stays clipped to [0, 1].
sns.histplot(
    df['y'],
    kde=True,
    stat='density',
    kde_kws={'clip': (0.0, 1.0)},
    color="seagreen",
    alpha=0.4,
)
sns.histplot(
    df['z'],
    kde=True,
    stat='density',
    kde_kws={'clip': (0.0, 1.0)},
    color="purple",
    alpha=0.4,
)
/usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /usr/local/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='z', ylabel='Density'>
When comparing Y (how often we ask about gender pronouns) and Z (our general awareness of gender pronouns), we gathered the following insights:
Our hypotheses are:
# Linear-regression views of y against x and y against z, side by side.
plt.style.use("ggplot")
# Only two panels are drawn, so a single row of two axes is sufficient. The
# previous 2x2 grid left its bottom row empty and the tick-label call indexed
# a non-existent panel (axs[1, 2]), which raised IndexError before plt.show().
fig, axs = plt.subplots(1, 2, figsize=(25, 8))
# NOTE(review): the surrounding text talks about gender *pronouns*; confirm
# whether "profiles" in this title is intended.
fig.suptitle("How often do we ask about gender profiles", fontsize=20)
for panel, predictor in zip(axs, ("x", "z")):
    sns.regplot(y="y", x=predictor, data=df, ax=panel)
    # Rotate the existing tick labels in place rather than re-setting them,
    # which avoids freezing label text before the figure is drawn.
    plt.setp(panel.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-125-9c8ebf331a15> in <module> 5 sns.regplot(y = 'y', x = 'x', data = df, ax = axs[0, 0]) 6 sns.regplot(y = 'y', x = 'z', data = df, ax = axs[0, 1]) ----> 7 axs[1,2].set_xticklabels(axs[1,2].get_xticklabels(), rotation = 45, horizontalalignment = 'right') 8 9 plt.show() IndexError: index 2 is out of bounds for axis 1 with size 2
From the two graphs we can conclude the following:
# Heatmap of the pairwise correlations between the columns of df.
corr = df.corr()
# Boolean mask that hides the diagonal and the upper triangle, which only
# repeat the lower triangle. The builtin `bool` is used because the `np.bool`
# alias is deprecated since NumPy 1.20 and removed in later releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(225, 7, as_cmap=True)
fig, axs = plt.subplots(1, 1, figsize=(15, 15))
# Bind the heatmap to the axes just created instead of relying on the
# implicit "current axes".
sns.heatmap(corr, mask=mask, cmap=cmap, ax=axs)
<ipython-input-126-8497c6b7606c>:2: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mask = np.zeros_like(corr, dtype=np.bool)
<AxesSubplot:>
Finally, to confirm all of the above, we plotted the correlation matrix between all of the variables and found: