Valuable Institutions: What Makes a University a Better Investment? ¶
Introduction ¶
One of the most important life decisions any teenager is expected to make is about their education. Will they pursue an education? What field of study would they like to be educated in? Which institution serves their interests best? At the incredibly young age of 16, high school students are expected to have an understanding of the answers to these questions, and by age 18, they likely will have committed to a future.
Unsurprisingly, finances are a critical component of the decision making process when it comes to committing to an educational institution. As families make what is likely the largest financial commitment after a house, there is an important discussion of value that surrounds education.
This tutorial will guide you through an analysis of institutional data in the United States and seeks to answer two key questions:
- How can you compare the efficiency of your investment between multiple institutions?
- What features of an institution suggest it is a better investment?
Data Curation and Management ¶
Data Sourcing¶
To kickstart the data science process, we first need to source some data and take a look at what is available.
Luckily, for a topic as important as higher education, the US government keeps a substantial amount of data about the nation's educational institutions.
The data we will be analyzing is the US Department of Education's College Scorecard. The College Scorecard includes aggregate data for thousands of postsecondary education institutions regarding institutional characteristics, enrollment, student aid, costs, and student outcomes.
Data Curation¶
Before we can begin, it's important to define some key attributes (or variables) that we will use to:
- limit the scope of the analysis and home in on the data we think is interesting
- distinguish the characteristics of each institution
- provide a metric of value (or investment efficiency) associated with the institution
First, we need to know the features of our dataset, and the best way to learn is by exploring the set's data dictionary. The Department of Education provides documentation for the College Scorecard, including a data dictionary, here: https://collegescorecard.ed.gov/data/documentation/. After some exploration of the dictionary, here are the features that caught my eye.
Key Attributes¶
Limiting Attributes (filtering variables):
- Highest degree awarded (HIGHDEG = 4) - Institution should offer a graduate degree. These institutions have the most resources available to them and are more fair to compare to each other.
- Currently operational (CURROPER = 1) - Institution should still be operational. We are not performing a historical analysis, so insights are most relevant about institutions that are still serving the population.
- Level of institution (ICLEVEL = 1) - Institution should be a 4-year institution. This ensures that the institutions included in the analysis are comparable in terms of objective and mission.
Institutional characteristics of interest (independent variables):
- Locale type (City | Suburb | Town | Rural) (LOCALE)
- Control (Public | Private Nonprofit | Private For-Profit) (CONTROL)
- Admission Rate (ADM_RATE)
- Enrollment (UGDS)
Value Metrics (dependent variables):
- Average Net Price (NPT4_PUB or NPT4_PRIV) - Important note: at public institutions, this is the student cost for in-state students only. This does not interfere with the analysis because all students have the choice to attend an in-state public university. A measurement of value should be based on the lower cost of attendance seen by most students.
- Median earnings of students working and not enrolled 10 years after enrollment (MD_EARN_WNE_P10) - Important note: this statistic includes the earnings of students who did not complete their education, but it is unlikely to be affected by them because it is a median.
For the purpose of our analysis, we need to create a normalized measurement of value. To accomplish this, we will define a new metric called investment-earnings equivalent (IEE). IEE will be the break-even point between cost of attendance and earnings. In other words, this is the ratio between 4 years of the average net annual price (ANP) and median earnings at 10 years after enrollment (ME). $$IEE = \frac{4*ANP}{ME}$$
This is a relative metric that is most useful for standardized comparisons between institutions. In essence, it describes the number of years of earnings it takes for a student to recoup the cost of attending that institution, or in other terms, the return on investment. In future analyses and comparisons, it should be noted that a lower IEE indicates that an institution is a more valuable investment than a higher IEE institution. This metric does not account for living expenses and tax costs, and therefore should not be confused with the student's debt repayment period. For example, if an institution's ANP is \$30,000 and its ME is \$55,000, then $IEE = \frac{4*30,000}{55,000} = 2.18$. That is, the educational investment required to attend the institution is equivalent to 2.18 years of post-enrollment earnings.
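As a quick sanity check, the worked example above can be reproduced with a small helper function (the name iee is ours for illustration, not a dataset field):

```python
def iee(avg_net_price, median_earnings, years=4):
    """Investment-earnings equivalent: years of post-enrollment earnings
    needed to break even on the cost of attendance."""
    return years * avg_net_price / median_earnings

# Worked example from above: ANP = $30,000, ME = $55,000
print(round(iee(30000, 55000), 2))  # 2.18
```

A lower value means a faster break-even, i.e. a better investment.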
Data Management¶
Let's import the data into our workspace and get started. We will be using pandas DataFrames to manipulate our data.
The dataset we will be using is available here: https://collegescorecard.ed.gov/data/. On the website, download the data titled "Most Recent Institution-Level Data".
import pandas as pd
# Import institution-level data for postsecondary education
inst_level = pd.read_csv('data/Most-Recent-Cohorts-Institution.csv', low_memory=False)
Now that we have the data in a workable format, it's time to begin our preliminary manipulations by applying the limiting attributes detailed above.
print('Before dropping: ' + str(inst_level['UNITID'].count()))
# Filter entries to currently operational institutions
inst_level = inst_level.loc[inst_level['CURROPER'] == 1]
# Filter entries to graduate-degree offering institutions
inst_level = inst_level.loc[inst_level['HIGHDEG'] == 4]
# Filter entries to 4-year institutions
inst_level = inst_level.loc[inst_level['ICLEVEL'] == 1]
print('After dropping: ' + str(inst_level['UNITID'].count()))
Before dropping: 6662
After dropping: 1957
Calculating IEE for all institutions:
Before we continue on to exploring our data, we should calculate the IEE for all institutions. We will be using these values often in the coming analyses, so it would be much more efficient to calculate them a single time now and use the pre-calculated values in the future.
Since our value metric is derived from average net price data and median earnings data, we need to remove any observations in the dataset that don't have that data available. Sometimes dropping missing data from the set can introduce bias to the statistical analysis, but the lack of available data for these institutions makes it impossible to analyze them. This is one of the limitations of this data analysis, as it's not feasible to know exactly why any specific piece of data is missing.
# Permanently drop any institution that doesn't have average net price or median earnings data.
print('Before dropping: ' + str(inst_level['UNITID'].count()))
inst_level = inst_level.dropna(subset='MD_EARN_WNE_P10')
inst_level = inst_level.dropna(subset=['NPT4_PUB', 'NPT4_PRIV'], thresh=1)
print('After dropping: ' + str(inst_level['UNITID'].count()))
# Add an empty IEE column to the dataset
inst_level['IEE'] = 0.0
# Calculate IEE for each institution and add it to the data entry.
for inst in inst_level.iterrows():
    if inst[1]['CONTROL'] == 1:
        inst_level.at[inst[0], 'IEE'] = float(4*inst[1]['NPT4_PUB'] / inst[1]['MD_EARN_WNE_P10'])
    else:
        inst_level.at[inst[0], 'IEE'] = float(4*inst[1]['NPT4_PRIV'] / inst[1]['MD_EARN_WNE_P10'])
Before dropping: 1957
After dropping: 1466
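The row-wise loop above works, but the same calculation can be written as a single vectorized expression, which pandas evaluates far faster on large frames. An equivalent sketch on a toy frame using the same column names:

```python
import numpy as np
import pandas as pd

# Toy frame with the same columns used above
df = pd.DataFrame({
    'CONTROL': [1, 2],                      # 1 = public, 2/3 = private
    'NPT4_PUB': [10000.0, np.nan],
    'NPT4_PRIV': [np.nan, 30000.0],
    'MD_EARN_WNE_P10': [50000.0, 60000.0],
})

# Choose the net-price column by control type, then compute IEE in one step
net_price = np.where(df['CONTROL'] == 1, df['NPT4_PUB'], df['NPT4_PRIV'])
df['IEE'] = 4 * net_price / df['MD_EARN_WNE_P10']
print(df['IEE'].tolist())  # [0.8, 2.0]
```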
Exploratory Data Analysis ¶
We've narrowed down our dataset considerably to the institutions we believe are worth comparison, so now it's time to begin exploring the remaining data. We will be making extensive use of matplotlib to visualize our data.
import matplotlib.pyplot as plt
Let's start by exploring relationships between our independent variables and our IEE value metric:
- IEE vs. Locale
- IEE vs. Control
- IEE vs. Admission Rate
- IEE vs. Enrollment
IEE vs. Institution Locale¶
We'll plot the IEE of each institution as a function of the institution's locale. The locale can be either city, suburb, town, or rural. An appropriate plot to visualize this relationship would be a violin plot that can show centrality, minimums, maximums, and distribution for each locale on the same plot.
# Aggregate data for each locale
inst_locales = [[],[],[],[]] # array with an array for each locale
for inst in inst_level.iterrows():
    # City locale
    if inst[1]['LOCALE'] >= 11 and inst[1]['LOCALE'] <= 13:
        inst_locales[0].append(inst[1]['IEE'])
    # Suburb locale
    elif inst[1]['LOCALE'] >= 21 and inst[1]['LOCALE'] <= 23:
        inst_locales[1].append(inst[1]['IEE'])
    # Town locale
    elif inst[1]['LOCALE'] >= 31 and inst[1]['LOCALE'] <= 33:
        inst_locales[2].append(inst[1]['IEE'])
    # Rural locale
    else: # inst[1]['LOCALE'] >= 41 and inst[1]['LOCALE'] <= 43
        inst_locales[3].append(inst[1]['IEE'])
# Set up plot
plt.figure()
plt.title('IEE by Locale')
plt.xlabel('Institution Locale')
plt.ylabel('Investment-Earnings Equivalent')
# Set up locale labels
locales = ['City', 'Suburb', 'Town', 'Rural']
locales_pos = range(1,5)
plt.xticks(locales_pos, locales)
plt.violinplot(inst_locales, locales_pos, showmedians=True)
print()
This plot shows that the IEE for each locale is pretty similar. All locales have IEEs centered between 1.5 and 2. Cities and suburbs have high outlier institutions. All 4 locales have mostly symmetric unimodal distributions that slightly skew upwards. The distribution of rural institutions is a bit wider than the rest, but it also has the lowest sample size. Shown below are the number of institutions in each locale. The association between IEE and locale is not worth further investigation.
# Number of institutions per locale
print('City institutions: ' + str(len(inst_locales[0])))
print('Suburb institutions: ' + str(len(inst_locales[1])))
print('Town institutions: ' + str(len(inst_locales[2])))
print('Rural institutions: ' + str(len(inst_locales[3])))
City institutions: 758
Suburb institutions: 362
Town institutions: 272
Rural institutions: 74
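As an aside, the NCES locale codes are structured so that the tens digit gives the broad category (11-13 city, 21-23 suburb, 31-33 town, 41-43 rural), so the binning loop above could also be written with integer division. A minimal sketch on sample codes:

```python
import pandas as pd

codes = pd.Series([11, 13, 21, 32, 41, 43])  # sample NCES locale codes
labels = {1: 'City', 2: 'Suburb', 3: 'Town', 4: 'Rural'}
# Tens digit of the code identifies the broad locale category
broad = (codes // 10).map(labels)
print(broad.tolist())  # ['City', 'City', 'Suburb', 'Town', 'Rural', 'Rural']
```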
IEE by Institution Control¶
Next, we'll plot the IEE of each institution as a function of the institution's control type. The control type can be either public, private nonprofit, or private for-profit. We can use a violin plot again to visualize this relationship.
# Aggregate data for each control type
inst_controls = [[],[],[]] # array with an array for each control type
for inst in inst_level.iterrows():
    # Public control
    if inst[1]['CONTROL'] == 1:
        inst_controls[0].append(inst[1]['IEE'])
    # Private Nonprofit control
    elif inst[1]['CONTROL'] == 2:
        inst_controls[1].append(inst[1]['IEE'])
    # Private For-profit control
    else: # inst[1]['CONTROL'] == 3
        inst_controls[2].append(inst[1]['IEE'])
# Set up plot
plt.figure()
plt.title('IEE by Control')
plt.xlabel('Institution Control')
plt.ylabel('Investment-Earnings Equivalent')
# Set up control labels
controls = ['Public','Private Nonprofit', 'Private For-Profit']
controls_pos = range(1,4)
plt.xticks(controls_pos, controls)
plt.violinplot(inst_controls, controls_pos, showmedians=True)
print()
The plot shows a clear divide between the IEEs of public and private institutions. All 3 control types have mostly symmetric unimodal distributions, but private institutions do have upward skews. The two different types of private institutions have similar enough distributions that we can homogenize them into one category. The association between IEE and institution control type is worth further investigation.
# Number of institutions per control
print('Public institutions: ' + str(len(inst_controls[0])))
print('Private Nonprofit institutions: ' + str(len(inst_controls[1])))
print('Private For-Profit institutions: ' + str(len(inst_controls[2])))
Public institutions: 510
Private Nonprofit institutions: 857
Private For-Profit institutions: 99
IEE vs. Institution Admission Rate¶
Now we'll plot each institution's IEE and admission rate. A simple scatter plot is the best way to visualize this data since both axes are continuous numerical values.
# Set up plot
plt.figure()
plt.title('IEE vs. Admission Rate')
plt.xlabel('Admission Rate')
plt.ylabel('Investment-Earnings Equivalent')
plt.scatter(inst_level['ADM_RATE'], inst_level['IEE'])
This plot is a decent starting point. It demonstrates two key concepts: (1) most institutions have higher admission rates (>50%) and (2) most low admission rate institutions have low IEEs. This makes sense as institutions that are more selective have more skilled and valuable applicants, and they receive more attention from high-paying companies for recruitment. Admission rate is definitely a good institutional characteristic to investigate further.
IEE vs. Enrollment¶
Lastly, we'll plot each institution's IEE and enrollment (student population). Again, a simple scatter plot will do.
# Set up plot
plt.figure()
plt.title('IEE vs. Enrollment')
plt.xlabel('Enrollment')
plt.ylabel('Investment-Earnings Equivalent')
plt.scatter(inst_level['UGDS'], inst_level['IEE'])
This plot offers some interesting information. The distribution is unimodal with a distinct upwards skew. It seems that higher-enrollment institutions tend to have lower IEEs than lower-enrollment institutions. A limitation of this visualization is that we have no notion of density anywhere on the plot; there are likely many more institutions with low enrollment than high enrollment. Enrollment figures are probably not worth further investigation.
Hypothesis Testing ¶
Now that we know what our attributes look like, we can start to formulate a hypothesis about the data. First, let's rehash some of our preliminary insights, and from there we can begin to hypothesize.
Insights from Data Exploration:
After exploring the data that we have, we've come to the conclusion that the independent variables of interest to our analysis are the control type of the institution and the institution's admission rate. This makes sense logically, as both control type and admission rate are forms of selection or filtration of the enrollment class.
Control type is directly associated with the socio-economic standing of the applicants. Private institutions lack the government support of public institutions and thus rely more heavily on student tuition, resulting in higher student costs. This increase in costs means that private schools need to offer higher earnings after graduation to retain their value.
Admission rate is a reflection of how selective an institution's admissions process is when it comes to a heuristic analysis of an applicant's accomplishments. A lower admission rate typically means that a university is in higher demand due to the quality of its education and/or connections.
Hypothesis:
Let's hypothesize that IEE decreases as admission rate decreases (more selective institutions are better investments) and that public universities have lower IEEs than private universities on average.
Visualizing Hypothesis Variables¶
Before we can begin fitting statistical models, we need to aggregate the data we've chosen to analyze and decide if any adjustments are needed. Now we will focus on visualizing IEE vs. admission rate and control type. Some important Python tools we will use to aggregate data and perform operations on it are statistics and numpy.
import statistics as stat
import numpy as np
To aggregate our data, we'll iterate over our dataframe and collect the IEEs and admission rates of each institution in 1 of 2 lists - 1 for public and 1 for private.
We need to drop any institution that doesn't have admission rate data available, as that data is critical for our analysis. Earlier we mentioned that the two different types of private schools can be homogenized into 1 group, so we will do that now as we aggregate our data.
# Permanently drop any institution that does not have admissions data
print('Before dropping: ' + str(inst_level['UNITID'].count()))
inst_level = inst_level.dropna(subset='ADM_RATE')
print('After dropping: ' + str(inst_level['UNITID'].count()))
# Permanently homogenize private nonprofit and for-profit institutions
for inst in inst_level.iterrows():
    if inst[1]['CONTROL'] == 3:
        inst_level.at[inst[0], 'CONTROL'] = 2
# Collect the IEE and admission rates of public and private institutions in 2 different pairs of parallel arrays.
public_IEE = []
public_ADM = []
private_IEE = []
private_ADM = []
for inst in inst_level.iterrows():
    if inst[1]['CONTROL'] == 1:
        public_IEE.append(inst[1]['IEE'])
        public_ADM.append(inst[1]['ADM_RATE'])
    else:
        private_IEE.append(inst[1]['IEE'])
        private_ADM.append(inst[1]['ADM_RATE'])
Before dropping: 1466
After dropping: 1291
Now let's visualize our data of interest. We'll make 3 IEE vs Admission Rate plots to show the data we want to analyze:
- A plot for public institutions
- A plot for private institutions
- A plot for all institutions together.
Also to make our lives easier, we'll add a horizontal line where the median IEE is for that group.
plt.figure(figsize=(12,4))
# Set up public institution plot
ax1 = plt.subplot(1,2,1)
ax1.scatter(public_ADM, public_IEE, color='blue')
ax1.axhline(stat.median(public_IEE), color='green', linewidth=3)
ax1.text(1, stat.median(public_IEE) + 0.1, str(round(stat.median(public_IEE), 2)), color='green', size=12)
ax1.set_title('IEE vs. Admission Rate (Public)')
ax1.set_xlabel('Admission Rate')
ax1.set_ylabel('Investment-Earnings Equivalent')
ax1.set_ylim([0, 6.5])
ax1.set_xlim([0, 1])
# Set up private institution plot
ax2 = plt.subplot(1,2,2)
ax2.scatter(private_ADM, private_IEE, color='red')
ax2.axhline(stat.median(private_IEE), color='green', linewidth=3)
ax2.text(1, stat.median(private_IEE) + 0.1, str(round(stat.median(private_IEE), 2)), color='green', size=12)
ax2.set_title('IEE vs. Admission Rate (Private)')
ax2.set_xlabel('Admission Rate')
ax2.set_ylabel('Investment-Earnings Equivalent')
ax2.set_ylim([0, 6.5])
ax2.set_xlim([0, 1])
# Set up all institution plot
plt.figure(figsize=(12,8))
plt.title('IEE vs. Admission Rate (All)')
plt.xlabel('Admission Rate')
plt.ylabel('Investment-Earnings Equivalent')
plt.ylim([0, 6.5])
plt.xlim([0, 1])
plt.scatter(public_ADM, public_IEE, color='blue')
plt.scatter(private_ADM, private_IEE, color='red')
plt.axhline(stat.median(public_IEE + private_IEE), color='green', linewidth=5)
plt.text(1, stat.median(public_IEE + private_IEE) + 0.1, str(round(stat.median(public_IEE + private_IEE), 2)), color='green', size=15)
print() # Otherwise prints nonsense
The median IEE for public schools is drastically lower than the median IEE for private schools.
This set of plots indicates something about our hypothesis that will need to be verified statistically: institution control type is a better predictor of value than admission rate. Institutional value (in terms of IEE) seems to be fairly even across admission rates, which doesn't bode well for our hypothesis. This is part of the process, however.
It's clear that the spread of our data is not conducive to a good analysis. There is a small handful of outliers far above the rest of the data, unique situations that don't reflect the broader population. We will use the traditional statistical outlier fences, $Q1 - 1.5 \cdot IQR$ and $Q3 + 1.5 \cdot IQR$, to identify outliers, then we'll remove them and run the above visualization again with the new set of data.
Q3 = np.quantile(inst_level['IEE'], 0.75)
Q1 = np.quantile(inst_level['IEE'], 0.25)
IQR = Q3 - Q1
print('Before removing outliers: ' + str(inst_level['IEE'].count()))
inst_level = inst_level.loc[inst_level['IEE'] < (Q3 + 1.5 * IQR)]
inst_level = inst_level.loc[inst_level['IEE'] > (Q1 - 1.5 * IQR)]
print('After removing outliers: ' + str(inst_level['IEE'].count()))
print('Non-outlier Range: (' + str(Q1 - 1.5 * IQR) + ' , ' + str(Q3 + 1.5 * IQR) + ')')
Before removing outliers: 1291
After removing outliers: 1252
Non-outlier Range: (0.16109890135194171 , 3.1058881998810275)
# Recollect the IEE and admission rates without the outliers.
public_IEE = []
public_ADM = []
private_IEE = []
private_ADM = []
for inst in inst_level.iterrows():
    if inst[1]['CONTROL'] == 1:
        public_IEE.append(inst[1]['IEE'])
        public_ADM.append(inst[1]['ADM_RATE'])
    else:
        private_IEE.append(inst[1]['IEE'])
        private_ADM.append(inst[1]['ADM_RATE'])
plt.figure(figsize=(12,4))
# Set up public institution plot
ax1 = plt.subplot(1,2,1)
ax1.scatter(public_ADM, public_IEE, color='blue')
ax1.axhline(stat.median(public_IEE), color='green', linewidth=3)
ax1.text(1, stat.median(public_IEE) + 0.1, str(round(stat.median(public_IEE), 2)), color='green', size=12)
ax1.set_title('IEE vs. Admission Rate (Public)')
ax1.set_xlabel('Admission Rate')
ax1.set_ylabel('Investment-Earnings Equivalent')
ax1.set_ylim([0, 3.25])
ax1.set_xlim([0, 1])
# Set up private institution plot
ax2 = plt.subplot(1,2,2)
ax2.scatter(private_ADM, private_IEE, color='red')
ax2.axhline(stat.median(private_IEE), color='green', linewidth=3)
ax2.text(1, stat.median(private_IEE) + 0.1, str(round(stat.median(private_IEE), 2)), color='green', size=12)
ax2.set_title('IEE vs. Admission Rate (Private)')
ax2.set_xlabel('Admission Rate')
ax2.set_ylabel('Investment-Earnings Equivalent')
ax2.set_ylim([0, 3.25])
ax2.set_xlim([0, 1])
# Set up all institution plot
plt.figure(figsize=(12,8))
plt.title('IEE vs. Admission Rate (All)')
plt.xlabel('Admission Rate')
plt.ylabel('Investment-Earnings Equivalent')
plt.ylim([0, 3.25])
plt.xlim([0, 1])
plt.scatter(public_ADM, public_IEE, color='blue')
plt.scatter(private_ADM, private_IEE, color='red')
plt.axhline(stat.median(public_IEE + private_IEE), color='green', linewidth=5)
plt.text(1, stat.median(public_IEE + private_IEE) + 0.1, str(round(stat.median(public_IEE + private_IEE), 2)), color='green', size=15)
print()
Statistical Modeling & Hypothesis Testing¶
For this data, we need to produce some sort of regressor because we are predicting a numeric value. Let's start with the simplest regressor: a linear regression model. We will be using statsmodels to perform regressions and retrieve each model's significance and predictive power.
import statsmodels.api as sm
Simple Linear Regression¶
The first statistical model we will explore is a basic linear regression. This will simply create a prediction line based on our data. We will be using our overall data, so public and private institutions together.
lm_est = sm.OLS(inst_level['IEE'], sm.add_constant(inst_level['ADM_RATE'])).fit()
print(lm_est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    IEE   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.7314
Date:                Mon, 16 May 2022   Prob (F-statistic):              0.393
Time:                        22:25:25   Log-Likelihood:                -975.55
No. Observations:                1252   AIC:                             1955.
Df Residuals:                    1250   BIC:                             1965.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5831      0.056     28.232      0.000       1.473       1.693
ADM_RATE       0.0649      0.076      0.855      0.393      -0.084       0.214
==============================================================================
Omnibus:                        7.036   Durbin-Watson:                   1.551
Prob(Omnibus):                  0.030   Jarque-Bera (JB):                7.140
Skew:                           0.181   Prob(JB):                       0.0282
Kurtosis:                       2.927   Cond. No.                         7.73
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
This summary is telling, and rather foreboding for our hypothesis. The p-value for the admission rate variable is 0.393, when the conventional threshold for statistical significance is 0.05. Additionally, our $R^2$ value is 0.001, while our goal is to get as close to 1 as possible. In essence, there is no linear relationship between admission rate and IEE.
Linear Regression with Interaction Term¶
Our hypothesis implies a statistical interaction between admission rate and institutional control type. A statistical interaction is a relationship between variables such that 2 or more explanatory (independent) variables influence the outcome of a response (dependent) variable. In this case, both admission rate and control type can influence the IEE of an institution.
To introduce an interaction term into a linear regression, we need to reformat our input data to include the values of all independent variables. In this case, we will create a list pairing each institution's admission rate with its control type encoded as 0 (public) or 1 (private). Then we can feed these pairs into a linear regression model.
# Reformat data for use in an interaction model.
IEEs = []
ADM_controls = []
for (iee, adm) in zip(public_IEE, public_ADM):
    IEEs.append(iee)
    ADM_controls.append([adm, 0]) # 0 represents public school
for (iee, adm) in zip(private_IEE, private_ADM):
    IEEs.append(iee)
    ADM_controls.append([adm, 1]) # 1 represents private school
# Fit interaction model using statsmodels.
lm_int_est = sm.OLS(IEEs, sm.add_constant(ADM_controls)).fit()
print(lm_int_est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.391
Model:                            OLS   Adj. R-squared:                  0.390
Method:                 Least Squares   F-statistic:                     400.2
Date:                Mon, 16 May 2022   Prob (F-statistic):          4.93e-135
Time:                        22:25:26   Log-Likelihood:                -665.92
No. Observations:                1252   AIC:                             1338.
Df Residuals:                    1249   BIC:                             1353.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9296      0.050     18.768      0.000       0.832       1.027
x1             0.3774      0.060      6.260      0.000       0.259       0.496
x2             0.6915      0.024     28.270      0.000       0.644       0.740
==============================================================================
Omnibus:                       18.507   Durbin-Watson:                   1.574
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               23.204
Skew:                           0.198   Prob(JB):                     9.15e-06
Kurtosis:                       3.536   Cond. No.                         9.22
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The linear interaction model is a drastic improvement over the simple linear model. The p-values are extremely close to 0, meaning the model is statistically significant. The $R^2$ value is 0.391, meaning the model explains roughly 39% of the variance in IEE: not great, but still much better than 0.001.
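One caveat: the model above enters control type as a second additive regressor; a full interaction also includes the product of the two predictors, which lets the admission-rate slope itself differ between public and private schools. A minimal numpy sketch on synthetic data (the data and coefficients here are invented purely for illustration):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 500
adm = rng.uniform(0, 1, n)       # admission rate
priv = rng.integers(0, 2, n)     # control: 0 = public, 1 = private
# Synthetic IEE whose admission-rate slope depends on control type
iee = 1.0 + 0.2 * adm + 0.6 * priv + 0.5 * adm * priv + rng.normal(0, 0.05, n)

# Design matrix: intercept, adm, priv, and the interaction column adm * priv
X = np.column_stack([np.ones(n), adm, priv, adm * priv])
beta, *_ = np.linalg.lstsq(X, iee, rcond=None)
print(np.round(beta, 2))  # approximately [1.0, 0.2, 0.6, 0.5]
```

The coefficient on the adm * priv column is the difference in slope between the two groups; statsmodels can express the same design more compactly through its formula API.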
Simple Polynomial Regression¶
There's a chance that our data is not linearly distributed, so it could benefit us to attempt a higher-order regression. To perform polynomial regressions, we will be using sklearn's PolynomialFeatures. We will start with a 2nd-degree polynomial, the simplest step up from a linear fit.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
xp = poly.fit_transform(np.array(public_ADM + private_ADM).reshape(-1, 1))
pm_est = sm.OLS(IEEs, xp).fit()
print(pm_est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     6.027
Date:                Mon, 16 May 2022   Prob (F-statistic):            0.00248
Time:                        22:25:26   Log-Likelihood:                -969.91
No. Observations:                1252   AIC:                             1946.
Df Residuals:                    1249   BIC:                             1961.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.2819      0.106     12.146      0.000       1.075       1.489
x1             1.2082      0.348      3.470      0.001       0.525       1.891
x2            -0.9397      0.279     -3.364      0.001      -1.488      -0.392
==============================================================================
Omnibus:                        8.921   Durbin-Watson:                   0.965
Prob(Omnibus):                  0.012   Jarque-Bera (JB):                8.968
Skew:                           0.207   Prob(JB):                       0.0113
Kurtosis:                       3.015   Cond. No.                         41.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
This simple polynomial regression is better than the simple linear regression in that its p-values are below 0.05, so the model is statistically significant. Beyond that, however, its predictive power is next to none because the $R^2$ value is very close to 0.
Polynomial Regression with Interaction Term¶
poly = PolynomialFeatures(degree=2)
xp = poly.fit_transform(ADM_controls)
pm_est = sm.OLS(IEEs, xp).fit()
print(pm_est.pvalues)
print(pm_est.summary())
[3.01444799e-02 3.60364220e-11 9.90189884e-23 2.67738558e-07 3.70922374e-04
 9.90189884e-23]
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.406
Model:                            OLS   Adj. R-squared:                  0.404
Method:                 Least Squares   F-statistic:                     212.8
Date:                Mon, 16 May 2022   Prob (F-statistic):          3.23e-139
Time:                        22:25:26   Log-Likelihood:                -650.19
No. Observations:                1252   AIC:                             1310.
Df Residuals:                    1247   BIC:                             1336.
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2802      0.129      2.171      0.030       0.027       0.533
x1             2.1590      0.323      6.679      0.000       1.525       2.793
x2             0.5298      0.053     10.008      0.000       0.426       0.634
x3            -1.1656      0.225     -5.173      0.000      -1.608      -0.724
x4            -0.4981      0.140     -3.570      0.000      -0.772      -0.224
x5             0.5298      0.053     10.008      0.000       0.426       0.634
==============================================================================
Omnibus:                       29.266   Durbin-Watson:                   1.580
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               37.000
Skew:                           0.281   Prob(JB):                     9.24e-09
Kurtosis:                       3.626   Cond. No.                     2.57e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.71e-30. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
The polynomial interaction model is the best statistical model we've put together so far. The p-values are all below 0.05, meaning the model is statistically significant, and the $R^2$ value is the highest yet at 0.406, though that still leaves most of the variance in IEE unexplained. Note the multicollinearity warning in the summary: because the control indicator only takes the values 0 and 1, its square (x5) is identical to the indicator itself (x2), which makes the design matrix singular.
Insight Analysis ¶
Visualizing Our Statistical Models¶
Now that we have our statistical models, let's try and see what they look like on our data and glean some insights from them. In order to plot the models, we will need the help of sklearn's LinearRegression, as statsmodels does not provide easily plottable models. Using a different library will not change the results of the models.
from sklearn.linear_model import LinearRegression
As our basis, we'll plot the all institution data from earlier. On top of it we will plot a few things:
- The simple linear regression (black)
- The linear regression interaction (public: light green, private: cyan)
- The simple polynomial regression (dark green)
Unfortunately, it's not feasible to plot the polynomial interaction model.
# Set up all institution plot
plt.figure(figsize=(12,8))
plt.title('IEE vs. Admission Rate (All)')
plt.xlabel('Admission Rate')
plt.ylabel('Investment-Earnings Equivalent')
plt.ylim([0, 3])
plt.xlim([0, 1])
plt.scatter(public_ADM, public_IEE, color='blue')
plt.scatter(private_ADM, private_IEE, color='red')
# Fit and plot linear regression for overall data
lm = LinearRegression().fit(np.array(public_ADM + private_ADM).reshape(-1,1), np.array(IEEs).reshape(-1,1))
print(lm.score(np.array(public_ADM + private_ADM).reshape(-1,1), np.array(IEEs).reshape(-1,1)))
plt.plot(np.linspace(0,1,100), lm.predict(np.linspace(0,1,100).reshape(-1,1)), color='black', linewidth=5)
# Fit and plot linear regression lines for each control type based on the interaction
lm = LinearRegression()
lm.fit(ADM_controls, IEEs)
print(lm.coef_)
print(lm.score(ADM_controls, IEEs))
# Build prediction grids for each control type (0 = public, 1 = private)
grid = np.linspace(0, 1, 100)
Xs_pub = [[x, 0] for x in grid]
Xs_priv = [[x, 1] for x in grid]
# Plot each line against the admission-rate grid (passing the full
# two-column Xs as the x-axis would incorrectly plot both columns)
plt.plot(grid, lm.predict(Xs_pub), color='springgreen', linewidth=5)
plt.plot(grid, lm.predict(Xs_priv), color='aqua', linewidth=5)
# Fit and plot polynomial regression lines for overall data
pm = np.poly1d(np.polyfit(public_ADM + private_ADM, IEEs, 2))
polyline = np.linspace(0, 1, 100)
plt.plot(polyline, pm(polyline), linewidth=5, color = 'darkgreen')
0.000584766704797568
[0.37741721 0.6915376 ]
0.390552273905726
Analyzing Our Models¶
Simple Linear Regression (black):
This looks awfully similar to the median lines from our earlier plots, and that's exactly why this model performs so poorly. Its predictions are hardly distinguishable from the median. This model is statistically insignificant and has no predictive power.
Linear Interaction Regression (light green & cyan): These two lines seem much more appropriate for the data at hand. This model shows a slight increase in IEE as admission rate increases, and it shows that private institutions generally have higher IEEs than public institutions. I'd like to reiterate that this model has a low p-value and a low $R^2$ value; what that combination means for the model is discussed below.
Simple Polynomial Regression (dark green):
This model is a bit more descriptive than the simple linear model. It is centered very similarly to the median, but it captures minor nuances of admission rate. Unlike the simple linear regression, this model is statistically significant, meaning it describes a trend in the data that isn't due to random variability. Unfortunately, its atrocious $R^2$ value means it has little predictive power.
Polynomial Interaction Regression (not visualized): While we cannot see this model visually, we can work from the p-values and $R^2$ values. Like the linear interaction, this model has both a low p-value and low $R^2$ value that I will get to in the following section.
Low P-Value and Low $R^2$ Value?¶
Generally Speaking:
The way I have been using "low p-value" is to describe any p-value below the standard significance threshold of 0.05. P-values below this threshold indicate the model is statistically significant: the results of the regression are very unlikely to be due to random chance or variability alone. There is some sort of pattern in the data causing it to be that way.
A low $R^2$ value (here, roughly 0.4 or below) means the correlation is weak: the model explains only a small portion of the variation in the response variable, so the data being analyzed is only loosely associated for one reason or another.
Putting these two together, it means that the statistical models that we've devised are capable of describing general trends in the data, but cannot accurately predict the response variable given the explanatory variables.
For Our Data:
In terms of the data we are analyzing, that means our interaction models can correctly describe the trend that IEE increases with admission rate and is higher for private institutions. However, if we were given the admission rate and control type of a hypothetical institution, our model would struggle to predict its IEE accurately.
Conclusion and Further Investigation ¶
Choosing an educational institution is incredibly difficult, especially given that it permanently changes the course of your life. This tutorial guided you through an in-depth analysis of university data, and there are a few key insights to take away from it:
- Institutions with lower admission rates do tend to offer better financial value to their students. Going the extra mile to differentiate your application from others with extracurriculars and exemplary academic performance will, on average, lead to a better financial future. This may seem like a no-brainer, but now you have statistical evidence to back up the claim.
- Public institutions offer much better value to their students than private institutions do. The smaller enrollment sizes, more lavish amenities, and greater overall resources that private institutions tend to hold simply do not balance out their higher student costs when it comes to earnings. If the quality of your financial investment is paramount to your decision, then go in-state.
There is still plenty of data to explore. The College Scorecard holds far more data than this brief tutorial could scratch the surface of. A key avenue for future exploration is the "field of study" data, which breaks down the institutions we analyzed today by field of study. Analyzing it could yield interesting insights about particular fields of study and the value they impart on their students. Another point of interest is income-based value: data for different income groups is available, so it is possible to determine whether certain types of institutions are more valuable for some income groups than others.
I hope that you can walk away from this tutorial having learned something new about data science, Python, statistics, or higher education!