Introduction: Fortune 500 Companies
Overview
The purpose of this introductory notebook is to get our feet wet with Python for data analysis, starting with a simple but certainly not boring dataset. We will be using the Fortune 500 companies list from 1955 to 2005.
We'll do some simple plotting with pandas and matplotlib to explore the data in greater detail and address issues with the data as we come across them.
Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
These are the holy trinity of Python data analysis; if you're ever in doubt, these imports are always a good place to start.
Data
The external data used in this notebook can be obtained from: https://www.dataquest.io/blog/large_files/fortune500.csv
The CSV file contains information about Fortune 500 companies spanning the years 1955-2005.
There are 25,500 rows: 51 years of 500 companies each.
The provided columns include:
- Year: year that the Fortune 500 list was published
- Rank: company's position on the Fortune 500 list
- Company: name of the company
- Revenue (in millions): company's revenue for that year
- Profit (in millions): company's profit for that year
### Uncomment this line to download the dataset into the data/ directory ###
#!wget -P data https://www.dataquest.io/blog/large_files/fortune500.csv
f500_df = pd.read_csv('data/fortune500.csv')
f500_df.head()
|   | Year | Rank | Company | Revenue (in millions) | Profit (in millions) |
|---|------|------|---------|-----------------------|-----------------------|
| 0 | 1955 | 1 | General Motors | 9823.5 | 806 |
| 1 | 1955 | 2 | Exxon Mobil | 5661.4 | 584.8 |
| 2 | 1955 | 3 | U.S. Steel | 3250.4 | 195.4 |
| 3 | 1955 | 4 | General Electric | 2959.1 | 212.6 |
| 4 | 1955 | 5 | Esmark | 2510.8 | 19.1 |
Exploratory Data Analysis
Overall, the data is relatively clean. However, there are a few potential issues that should be noted:
- The Profit column had some entries with a value of 'N.A.', which caused the dtype of the entire column to be cast to object instead of float, preventing many numpy operations. This was resolved by simply replacing the 'N.A.' values with np.nan (0 could also have been used).
- Two companies (CBS and OfficeMax) appear to have been added to the list twice between the years 1972-1981 and 1995-2003. No attempt was made to address these duplications, in case there was a particular reason for them (such as pre-merger representations), but it could be done by dropping the duplicate rows or replacing the company name with unknown; a sketch of the first option is shown below.
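For reference, here is a minimal sketch of how the duplicate rows could be dropped if we chose to. It keeps the first entry for each (Year, Company) pair and is not applied anywhere in this notebook.
# Hypothetical cleanup (not applied here): keep only the first row per (Year, Company) pair
f500_dedup = f500_df.drop_duplicates(subset=['Year', 'Company'], keep='first')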
f500_df.dtypes
Year                       int64
Rank                       int64
Company                   object
Revenue (in millions)    float64
Profit (in millions)      object
dtype: object
# replace string 'N.A.' with numpy's nan value to correct dtype of Profit column
f500_df['Profit (in millions)'] = f500_df['Profit (in millions)'].replace('N.A.', np.nan).astype(float)
f500_df.dtypes
Year                       int64
Rank                       int64
Company                   object
Revenue (in millions)    float64
Profit (in millions)    float64
dtype: object
The dtype of Profit hinted at a potential issue with the data. Replacing the 'N.A.' values with numpy's np.nan
allows us to cast the column to a float instead of an object, opening up the data for far more manipulations.
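An equivalent approach, sketched below but not run in this notebook, is pandas' to_numeric with errors='coerce', which converts anything unparseable to NaN in a single step.
# Alternative to the replace/astype cell above (not run here): coerce non-numeric entries to NaN
f500_df['Profit (in millions)'] = pd.to_numeric(f500_df['Profit (in millions)'], errors='coerce')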
f500_df.Company.nunique()
1887
1887 unique companies have been included on the Fortune 500 list over the years
(f500_df.Company.value_counts() > 50).sum()
74
74 companies have appeared on the list more than 50 times, which, duplicate entries aside, means they have been on the list every year since 1955
f500_df.Company.value_counts()[:10]
CBS                        57
OfficeMax                  55
Motorola                   51
Campbell Soup              51
Rohm & Haas                51
Procter & Gamble           51
Sunoco                     51
Kellogg                    51
Intl. Business Machines    51
Exxon Mobil                51
Name: Company, dtype: int64
It appears that CBS and OfficeMax have duplicate entries in certain years; let's see if we can track down the years in question.
# Years 1972-1981 and 1995-2003 have duplicate values
(f500_df[f500_df.Company.isin(['CBS','OfficeMax'])]
 .groupby(['Year','Company'])[['Company','Year']]
.count()).query('Company > 1')
| Year | Company | Company (count) | Year (count) |
|------|---------|-----------------|--------------|
| 1972 | CBS | 2 | 2 |
| 1973 | CBS | 2 | 2 |
| 1974 | CBS | 2 | 2 |
| 1975 | CBS | 2 | 2 |
| 1976 | CBS | 2 | 2 |
| 1977 | CBS | 2 | 2 |
| 1978 | CBS | 2 | 2 |
| 1979 | CBS | 2 | 2 |
| 1980 | CBS | 2 | 2 |
| 1981 | CBS | 2 | 2 |
| 1995 | CBS | 2 | 2 |
| 1996 | OfficeMax | 2 | 2 |
| 1997 | OfficeMax | 2 | 2 |
| 1998 | OfficeMax | 2 | 2 |
| 1999 | OfficeMax | 2 | 2 |
| 2000 | OfficeMax | 2 | 2 |
| 2001 | OfficeMax | 2 | 2 |
| 2002 | OfficeMax | 2 | 2 |
| 2003 | OfficeMax | 2 | 2 |
# Choose 3 companies for plotting
comp3_df = f500_df[f500_df.Company.isin(['Kellogg','3M','Pfizer'])].copy()
# Rename the columns for easier pandas access
comp3_df.rename(columns={'Revenue (in millions)':'Revenue_m', 'Profit (in millions)':'Profit_m'}, inplace=True)
comp3_df.sort_values(['Year','Rank'])[:9]
|     | Year | Rank | Company | Revenue_m | Profit_m |
|-----|------|------|---------|-----------|----------|
| 130 | 1955 | 131 | 3M | 230.9 | 24.6 |
| 185 | 1955 | 186 | Kellogg | 169.5 | 12.7 |
| 214 | 1955 | 215 | Pfizer | 145.2 | 15.2 |
| 624 | 1956 | 125 | 3M | 281.9 | 34.7 |
| 689 | 1956 | 190 | Kellogg | 182.1 | 13.8 |
| 706 | 1956 | 207 | Pfizer | 163.8 | 15.3 |
| 1115 | 1957 | 116 | 3M | 330.8 | 38.7 |
| 1195 | 1957 | 196 | Kellogg | 201.7 | 15.1 |
| 1217 | 1957 | 218 | Pfizer | 178.4 | 18.3 |
plt.style.use('seaborn')
fig, ax = plt.subplots(figsize=(10,6))
ax.plot('Year','Rank',data=comp3_df[comp3_df.Company=='3M'],c='red', linestyle='solid',label='3M')
ax.plot('Year','Rank',data=comp3_df[comp3_df.Company=='Kellogg'],c='orange', linestyle='dotted',label='Kellogg')
ax.plot('Year','Rank',data=comp3_df[comp3_df.Company=='Pfizer'],c='blue', linestyle='dashed',label='Pfizer')
ax.axvline(1994)
ax.set_xlabel('Year', fontdict={'fontsize':'14'})
ax.set_xticks(np.arange(1955,2006,5))
ax.set_ylabel('Rank',fontdict={'fontsize':'14'})
ax.set_yticks(np.arange(0,300,30))
ax.invert_yaxis()
ax.legend(fontsize='large',loc='best')
fig.suptitle('Fortune 500 Rank')
ax.set_title('3M, Kellogg, and Pfizer (1955-2005)');
Clearly, something happened in 1994 that led to a rank decline for at least three powerful companies. In the absence of other information, one might guess that the proliferation of the internet was at play in these sharp declines.
To get a closer look, we'll zoom in on the years 1993 through 1996.
f93_96 = f500_df[f500_df.Year.isin(range(1993,1997))][['Year','Company','Rank']]
Using a pivot table, we can see which companies joined the Fortune 500 list between these years.
f36_piv = f93_96.pivot_table(values=['Rank'],index=['Company'],columns=['Year']).Rank
f36_piv.fillna(501).diff(axis=1)[[1994,1995]].sort_values([1995])[:10]
| Company | 1994 | 1995 |
|---------|------|------|
| Wal-Mart Stores | 0.0 | -497.0 |
| AT&T | 0.0 | -496.0 |
| Sears Roebuck | 0.0 | -492.0 |
| State Farm Insurance Cos | 0.0 | -489.0 |
| Prudential Ins. Co. of America | 0.0 | -488.0 |
| Kmart Holding | 0.0 | -486.0 |
| Citicorp | 0.0 | -484.0 |
| ITT Industries | 0.0 | -478.0 |
| Kroger | 0.0 | -476.0 |
| American Intl. Group | 0.0 | -475.0 |
Here, the negative numbers indicate how far each company jumped in rank between 1994 and 1995: because missing companies were filled with a placeholder rank of 501, a value of -497 means a company that was absent in 1994 appeared at rank 4 in 1995. A 0 in the 1994 column indicates no change from 1993, i.e., these companies were not ranked in either year.
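To make the mechanics concrete, here is a toy illustration (with a made-up company name and ranks) of what fillna(501).diff(axis=1) produces for a company that was absent until 1995:
# Toy example (made-up data): absent in 1993 and 1994, ranked 4th in 1995
toy = pd.DataFrame({1993: [np.nan], 1994: [np.nan], 1995: [4.0]}, index=['NewCo'])
toy.fillna(501).diff(axis=1)
# 1993 -> NaN, 1994 -> 0.0 (absent both years), 1995 -> -497.0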
f36_piv[f36_piv[1994].isna() & ~f36_piv[1995].isna()].shape
(288, 4)
In total, 288 companies joined the Fortune 500 list that were not on it the previous year. So, our original hypothesis that the internet was the largest contributor is likely inaccurate. Given the sheer number of new companies on the list, the more likely scenario is that in 1995 there was a change in how the rankings were calculated, or more companies were being considered.
# For each year, count the companies that were on the previous year's list but not this year's
f5s = f500_df.pivot_table(values=['Rank'],index=['Year'],columns=['Company']).Rank
year_diff = pd.Series({i+1: len(set(f5s.loc[i].dropna().index) - set(f5s.loc[i+1].dropna().index)) for i in f5s.index if i < 2005})
ax = year_diff.plot(xticks=np.arange(1955,2006,5),
                    title="Difference in companies compared to previous year")
ax.set_xlabel('Year')
ax.set_ylabel(r'$\Delta$ Companies');
Looking at a chart of the year-to-year change in listed companies, we can see that 1994 to 1995 is an enormous outlier compared to all other year pairs. Assuming there isn't an issue with the dataset itself, we can almost certainly attribute this change to a revision in how the Fortune 500 list was chosen.
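If we wanted to confirm the outlier numerically rather than visually, something along these lines would do it (output not shown here):
# Years with the largest turnover relative to the previous year; 1995 should top the list
year_diff.sort_values(ascending=False).head()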
Conclusions
This notebook took a brief look at the Fortune 500 companies dataset (1955-2005). Our analysis led to several interesting discoveries: we found that CBS and OfficeMax are included twice in certain years, and that from 1994 to 1995 there was a dramatic shift in the included companies. Despite the chosen dataset being relatively simple, we still managed to extract some useful information from it. We did not even use the revenue or profit columns, so there is still a lot of room for exploration.
Future Work
We leave this notebook with some unanswered questions:
- What actually caused the '94-95 shift?
- Why were some companies included multiple times?
A fully fleshed-out analysis would seek external sources of information and bring them into the notebook to answer these questions. Additionally, we could certainly investigate profits and revenue to see how they correlate with ranking from year to year, or even do some simple linear modeling to really start to understand this data; a small sketch of that direction follows below. These sorts of analyses and many others will be performed in the following notebooks, so have a look around if you'd like to dig a little deeper.
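As a small taste of that future work, here is a sketch (not run in this notebook) of how we might check, year by year, how strongly revenue tracks rank:
# Sketch: Spearman correlation between revenue and rank within each year
corr_by_year = (f500_df
                .groupby('Year')
                .apply(lambda g: g['Revenue (in millions)'].corr(g['Rank'], method='spearman')))
corr_by_year.describe()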
Tags: EDA, fortune500, introduction, python