First pass

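Exploratory analysis of the TidyTuesday recent-graduates dataset: load and clean the data, plot major-category popularity and the top median salaries, and generate a ydata-profiling report.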

# Embed the YouTube video at a quarter of 1920x1080.
# The import lives in this cell because the cell runs before the notebook's
# import section and no other visible cell imports YouTubeVideo.
from IPython.display import YouTubeVideo

# Integer division keeps the pixel dimensions as ints (480 x 270) instead of
# the floats 480.0 / 270.0 that true division would pass along.
YouTubeVideo("jHXTzL5tyG4", width=1920 // 4, height=1080 // 4)

Imports

source

foo

 foo ()

import datetime
import hashlib
import json
import os
import re
import sys
import time
import warnings

import ipywidgets as widgets
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from matplotlib.ticker import FuncFormatter
from pandas.plotting import register_matplotlib_converters
from scipy.stats import norm

# Register pandas' converters with matplotlib.
register_matplotlib_converters()
import seaborn as sns
from IPython.display import Markdown, display

# Plot styling: seaborn defaults, then the "poster" context with larger fonts,
# then the default figure size.
sns.set()
sns.set_context("poster", font_scale=1.3)
plt.rcParams["figure.figsize"] = (10, 6)

# DataFrame display: no limit on shown columns or rows, precision of 4.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 4)

# Silence FutureWarning messages in the notebook output.
warnings.simplefilter("ignore", FutureWarning)

from ydata_profiling import ProfileReport

Functions

def raw_to_clean(raw):
    """Return a copy of ``raw`` with the ``Major`` column title-cased.

    The input frame is not modified; every other column is carried over
    unchanged.
    """
    title_cased_major = raw["Major"].str.title()
    return raw.assign(Major=title_cased_major)

def plot_top_n_salaries(df, top_n):
    """Plot P25th, Median and P75th salaries for the top-paying majors.

    Selects the ``top_n`` rows of ``df`` with the largest ``Median`` and draws
    a grouped horizontal bar chart with one group per ``Major``. Nothing is
    returned; the figure is created through pyplot.
    """
    salary_cols = ['P25th', 'Median', 'P75th']

    # Reshape to long format (one row per major and salary statistic) so the
    # statistic name can drive the bar hue.
    top_majors = df.nlargest(top_n, 'Median')
    long_form = top_majors[['Major'] + salary_cols].melt(
        id_vars='Major', value_vars=salary_cols
    )

    fig, ax = plt.subplots(figsize=(20, 8))
    fig.patch.set_facecolor('w')

    sns.barplot(data=long_form, x='value', y='Major', hue='variable', ax=ax)
    ax.set_title('P25, Median, and P75 Salaries for Top {} Majors'.format(top_n))
    ax.set_xlabel('Salary')
    ax.set_ylabel('Major')

    # Shrink the axes to 80% of their width and anchor the legend just past
    # the right edge, vertically centred.
    bounds = ax.get_position()
    ax.set_position([bounds.x0, bounds.y0, bounds.width * 0.8, bounds.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    fig.tight_layout()

Data

# Recent-graduates CSV from the TidyTuesday repository (2018-10-16 folder).
RECENT_GRADS_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv"
raw = pd.read_csv(RECENT_GRADS_URL)

Cleaning

# Run the cleaning step on the raw frame, then preview the first five rows.
df = raw.pipe(raw_to_clean)

df.head(n=5)

	Rank	Major_code	Major	Total	Men	Women	Major_category	ShareWomen	Sample_size	Employed	Full_time	Part_time	Full_time_year_round	Unemployed	Unemployment_rate	Median	P25th	P75th	College_jobs	Non_college_jobs	Low_wage_jobs
0	1	2419	Petroleum Engineering	2339.0	2057.0	282.0	Engineering	0.1206	36	1976	1849	270	1207	37	0.0184	110000	95000	125000	1534	364	193
1	2	2416	Mining And Mineral Engineering	756.0	679.0	77.0	Engineering	0.1019	7	640	556	170	388	85	0.1172	75000	55000	90000	350	257	50
2	3	2415	Metallurgical Engineering	856.0	725.0	131.0	Engineering	0.1530	3	648	558	133	340	16	0.0241	73000	50000	105000	456	176	0
3	4	2417	Naval Architecture And Marine Engineering	1258.0	1123.0	135.0	Engineering	0.1073	16	758	1069	150	692	40	0.0501	70000	43000	80000	529	102	0
4	5	2405	Chemical Engineering	32260.0	21239.0	11021.0	Engineering	0.3416	289	25694	23170	5180	16697	1672	0.0611	65000	50000	75000	18314	4440	972

Data Visualization Plots

Popularity of each Major Category

# Sum of `Total` per major category, sorted from largest to smallest.
major_popularity = (
    df.groupby('Major_category')['Total']
    .sum()
    .reset_index()
    .sort_values(by='Total', ascending=False)
)

fig, ax = plt.subplots(figsize=(16, 12))
fig.patch.set_facecolor('w')

sns.barplot(data=major_popularity, x='Total', y='Major_category', ax=ax)
ax.set(
    title='Popularity of Each Major Category',
    xlabel='Total Number of Students',
    ylabel='Major Category',
)

fig.tight_layout()

Top N Median salaries

# Bar chart of the quartile salaries for the five majors with the highest median.
plot_top_n_salaries(df, 5)

# Histogram of the Median column across all majors.
# It gets its own figure and axes: the pyplot-level calls (plt.hist,
# plt.title, ...) draw on the *current* axes, which right after
# plot_top_n_salaries is the bar chart's axes whenever both run in the same
# cell or script.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(df['Median'], bins=10, edgecolor='black')
hist_ax.set_title('Distribution of Median Salaries')
hist_ax.set_xlabel('Median Salary')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(axis='y')
plt.show()

Ydata Profile

# Build the ydata-profiling report for the cleaned frame.
# The config path points into one user's Downloads folder, so it only exists
# on that machine. Use it when present (unchanged behaviour there); otherwise
# fall back to the library's built-in minimal configuration instead of failing
# on the missing file.
# NOTE(review): presumably the local file is a copy of the packaged
# config_minimal.yaml - confirm it carries no custom settings.
config_path = "/Users/jonathan/Downloads/config_minimal.yaml"
if os.path.exists(config_path):
    profile = ProfileReport(df, title='College Data', config_file=config_path)
else:
    profile = ProfileReport(df, title='College Data', minimal=True)

# Last expression of the cell: lets the notebook render the report.
profile

<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string

Imports

foo

Functions

Data

Cleaning

Data Visualization Plots

Popularity of each Major Category

Top N Median salaries

Ydata Profile

Cleanup