# create newproject database

import psycopg2
from psycopg2.extras import RealDictCursor
import csv

import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Create the newproject database if it does not exist yet.
#
# The connection is switched to autocommit because PostgreSQL refuses to run
# CREATE DATABASE inside a transaction block.  The try/finally guarantees the
# connection is released even when a statement fails.
conn = psycopg2.connect(host='localhost')
try:
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    with conn.cursor() as cursor:
        # Skip creation when the database is already there so the script can
        # be re-run, matching the DROP TABLE IF EXISTS approach used for the
        # tables.
        cursor.execute(
            "SELECT 1 FROM pg_database WHERE datname = %s",
            ('newproject',),
        )
        if cursor.fetchone() is None:
            cursor.execute(
                '''
                CREATE DATABASE newproject
                '''
            )
finally:
    conn.close()


# create tables
#
# Every table is dropped first, so running this step again resets the schema.
# state_names stores the synonyms (one row per spelling) that point back to a
# row in states through state_id.

# psycopg2's `with connection:` block only commits (or rolls back) the
# transaction; it does not close the connection, so it is closed explicitly.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute(
                '''
                DROP TABLE IF EXISTS states;
                CREATE TABLE states (
                    id SERIAL PRIMARY KEY,
                    label TEXT,
                    census_region_name TEXT,
                    census_division_name TEXT
                );

                DROP TABLE IF EXISTS salaries;
                CREATE TABLE salaries (
                    id SERIAL PRIMARY KEY,
                    name TEXT,
                    annual_salary INTEGER,
                    state_id INTEGER
                );

                DROP TABLE IF EXISTS populations;
                CREATE TABLE populations (
                    id SERIAL PRIMARY KEY,
                    name TEXT,
                    population_num INTEGER,
                    growth FLOAT,
                    state_id INTEGER
                );

                DROP TABLE IF EXISTS colleges;
                CREATE TABLE colleges (
                    id SERIAL PRIMARY KEY,
                    name TEXT,
                    student_pop INTEGER,
                    graduation_rate FLOAT,
                    state_id INTEGER
                );

                DROP TABLE IF EXISTS state_names;
                CREATE TABLE state_names (
                    id SERIAL PRIMARY KEY,
                    name TEXT,
                    state_id INTEGER
                );
                '''
            )
finally:
    conn.close()


import psycopg2
from psycopg2.extras import RealDictCursor
import csv


# insert states.csv data into the states table and register each state's
# full name and abbreviation as synonyms in the state_names table

# Inserts one state; the CSV's "name" column is stored as states.label.
# RETURNING id hands back the generated key for the synonym rows below.
state_sql = """
     INSERT INTO states(
                     label,
                     census_region_name,
                     census_division_name
                  ) VALUES(
                     %(name)s,
                     %(census_region_name)s,
                     %(census_division_name)s
                  )
      RETURNING id
      """

# Inserts one synonym (a spelling under which the state may appear in the
# other CSV files) pointing at the state it belongs to.
statename_sql = """
      INSERT INTO state_names(
                     name,
                     state_id
                  ) VALUES(
                     %(name)s,
                     %(state_id)s
                  )
       RETURNING id
       """

# `with conn:` commits on success and rolls back on error but leaves the
# connection open, hence the explicit close in the finally clause.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            # newline='' is what the csv module asks for so that quoted
            # fields containing line breaks are parsed correctly.
            with open('states.csv', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    cursor.execute(state_sql, row)
                    state_id = cursor.fetchone()['id']

                    # Two synonyms per state: the full name, then the
                    # abbreviation.
                    for synonym in (row["name"], row["abbreviation"]):
                        cursor.execute(
                            statename_sql,
                            {"name": synonym, "state_id": state_id},
                        )
finally:
    conn.close()


# insert the salaries.csv data into the salaries table and clean the data

import re

# Resolves the CSV's "State" value to a states.id through the synonym table.
sql = """
       SELECT state_id
       FROM state_names
       WHERE name = %(State)s
       """

insert_salary_sql = """
       INSERT INTO salaries(
                      name,
                      annual_salary,
                      state_id
                   ) VALUES (
                      %(State)s,
                      %(annual_salary)s,
                      %(state_id)s
                   )
       """

# Dollar signs and thousands separators are stripped from "Annual Salary" so
# the value fits the INTEGER column.  Compiled once, outside the row loop,
# and written as a raw string to avoid an invalid "\$" escape sequence.
salary_noise = re.compile(r'[$,]')

# `with conn:` only ends the transaction; the connection itself is closed in
# the finally clause.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            with open('salaries.csv', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    cleaned_salary = salary_noise.sub('', row["Annual Salary"])

                    cursor.execute(sql, row)
                    match = cursor.fetchone()
                    if match is None:
                        # No synonym matches this row's State value; such
                        # rows are left out of the salaries table.
                        continue

                    cursor.execute(
                        insert_salary_sql,
                        {"state_id": match['state_id'],
                         "annual_salary": cleaned_salary,
                         "State": row["State"]},
                    )
finally:
    conn.close()


# insert the populations.csv data into the populations table

# Resolves the CSV's "State" value to a states.id through the synonym table.
sql = """
       SELECT state_id
       FROM state_names
       WHERE name = %(State)s
       """

insert_population_sql = """
       INSERT INTO populations(
                      name,
                      population_num,
                      growth,
                      state_id
                   ) VALUES (
                      %(State)s,
                      %(population_num)s,
                      %(growth)s,
                      %(state_id)s
                   )
       """

# `with conn:` only ends the transaction; the connection itself is closed in
# the finally clause.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            with open('populations.csv', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    cursor.execute(sql, row)
                    match = cursor.fetchone()
                    if match is None:
                        # No synonym matches this row's State value; such
                        # rows are left out of the populations table.
                        continue

                    # "Pop" and "Growth" are passed through as read from the
                    # CSV; no cleaning is applied to them here.
                    cursor.execute(
                        insert_population_sql,
                        {"state_id": match['state_id'],
                         "population_num": row["Pop"],
                         "growth": row["Growth"],
                         "State": row["State"]},
                    )
finally:
    conn.close()


# insert the colleges.csv data into the colleges table and clean the data

import re

# Resolves the CSV's "State" value to a states.id through the synonym table.
sql = """
       SELECT state_id
       FROM state_names
       WHERE name = %(State)s
       """

insert_college_sql = """
       INSERT INTO colleges(
                      name,
                      student_pop,
                      graduation_rate,
                      state_id
                   ) VALUES (
                      %(State)s,
                      %(student_pop)s,
                      %(graduation_rate)s,
                      %(state_id)s
                   )
       """

# Characters removed before the values are stored.  Both patterns are
# compiled once, outside the row loop, and written as raw strings to avoid
# the invalid "\$" and "\%" escape sequences.
student_pop_noise = re.compile(r'[$,]')
grad_rate_noise = re.compile(r'[%,]')

# `with conn:` only ends the transaction; the connection itself is closed in
# the finally clause.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            with open('colleges.csv', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    cleaned_studentpop = student_pop_noise.sub(
                        '', row["Student Population"])
                    # The graduation rate is stored as a fraction: the
                    # cleaned number is divided by 100.
                    graduation_fraction = float(
                        grad_rate_noise.sub('', row["Graduation Rate"])) / 100

                    cursor.execute(sql, row)
                    match = cursor.fetchone()
                    if match is None:
                        # No synonym matches this row's State value; such
                        # rows are left out of the colleges table.
                        continue

                    cursor.execute(
                        insert_college_sql,
                        {'state_id': match['state_id'],
                         'student_pop': cleaned_studentpop,
                         'graduation_rate': graduation_fraction,
                         'State': row['State']},
                    )
finally:
    conn.close()


# First analysis: compare population data to college data
#
# Correlates, per state, the number of residents per student
# (population / student population) with the graduation rate.

import scipy.stats

# Two details of this query matter for the statistics:
#  * population_num and student_pop are both INTEGER columns, and PostgreSQL
#    truncates integer / integer, so the numerator is cast to FLOAT to keep
#    the fractional part of the ratio.
#  * state_names is not joined.  It holds two rows per state (full name and
#    abbreviation), so joining it would return every state twice.  The
#    state's name is read from states.label instead.
sql = """
SELECT states.id AS id,
       states.label AS name,
       states.census_region_name AS region,
       states.census_division_name AS division,
       populations.population_num AS population,
       populations.growth AS growth,
       colleges.student_pop AS student_population,
       CAST(populations.population_num AS FLOAT) / colleges.student_pop AS student_ratio,
       colleges.graduation_rate AS graduation_rate,
       salaries.annual_salary

FROM states
  JOIN populations
    ON states.id = populations.state_id
  JOIN colleges
    ON states.id = colleges.state_id
  JOIN salaries
    ON states.id = salaries.state_id
ORDER BY id
"""

# `with conn:` only ends the transaction; the connection itself is closed in
# the finally clause.
conn = psycopg2.connect(host='localhost', dbname='newproject')
try:
    with conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
finally:
    conn.close()

student_ratio = [row["student_ratio"] for row in rows]
graduation_rate = [row["graduation_rate"] for row in rows]

# Index [0] of each result is the correlation coefficient.
print("scipy.stats.pearsonr:")
print(scipy.stats.pearsonr(student_ratio, graduation_rate)[0])
print("--------")
print("scipy.stats.spearmanr:")
print(scipy.stats.spearmanr(student_ratio, graduation_rate)[0])

scipy.stats.pearsonr:
-0.3600290872795981
--------
scipy.stats.spearmanr:
-0.3823016760956874


# Second analysis: compare population data to salary data in different regions

# Load the IPython extension named "sql" and export DATABASE_URL so that it
# points at the local newproject database (presumably the connection the
# %%sql cell below falls back to -- confirm against the extension's docs).
%load_ext sql
%env DATABASE_URL=postgresql://localhost/newproject

env: DATABASE_URL=postgresql://localhost/newproject


%%sql
SELECT states.census_region_name AS region,
       Sum(populations.population_num) AS population,
       Sum(colleges.student_pop) AS student_population,
       Sum(salaries.annual_salary) / count(states.census_region_name) AS salary

-- Per census region this returns total population, total student population
-- and the mean annual salary (sum divided by the number of joined rows).
--
-- state_names is intentionally not joined. It holds two rows per state
-- (full name and abbreviation), so joining it counted every state twice and
-- doubled both Sum() totals. The salary column is a ratio and is unaffected.
-- Figures copied by hand from an earlier run of this query need refreshing
-- after it is rerun.
FROM states
  JOIN populations
    ON states.id = populations.state_id
  JOIN colleges
    ON states.id = colleges.state_id
  JOIN salaries
    ON states.id = salaries.state_id

GROUP BY states.census_region_name
ORDER BY states.census_region_name

4 rows affected.


pip install matplotlib


! pip install pandas


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Plot student population (bars, left axis) and salary (line, secondary
# y-axis) for the four census regions.

# Bar width; also used below to pad the x-axis limits.
width = .55

# Hard-coded values, one entry per region in the order Midwest, Northeast,
# South, West (the order of the tick labels set below).  They equal the
# student_population and salary columns of the region table recorded
# elsewhere in this file.
# NOTE(review): those totals appear to come from a query that joins
# state_names, which holds two rows per state, so student_population may be
# double-counted here -- confirm and refresh from a corrected query.
m1_t = pd.DataFrame({
 'student_population' : [8361870, 6924086, 13898568, 10382710],
 'salary' : [60993, 63859, 58892, 62934]})
    
# Bars first, then salary drawn on the same figure against a secondary y-axis.
m1_t[['student_population']].plot(kind='bar', width = width, color = 'lightgray')
m1_t['salary'].plot(secondary_y=True, color = 'salmon')

# Fetch the current axes to relabel the integer x positions with region names.
ax = plt.gca()
# x range runs from -width to (number of regions - width).
plt.xlim([-width, len(m1_t['student_population'])-width])
ax.set_xticklabels(('Midwest','Northeast','South','West'))
plt.title("Student Population and Salary in Different Regions")
  

plt.show()

Anita Tsai's Project¶

Project Description¶

My project goals are:¶

Data Sources¶

CSV screenshot of US States Population¶

CSV screenshot of US States College¶

CSV screenshot of US States Salary¶

CSV screenshot of US States Table¶

Workflow¶

Database Design¶

ER Diagram¶

Relational Vocab¶

Sample Tables¶

Manage Synonyms¶

The code used to load the data into the database¶

Analysis¶

Challenge¶

Analysis Tool Learning¶

region	population	student_population	salary
Midwest	137027512	8361870	60993
Northeast	111710338	6924086	63859
South	253776650	13898568	58892
West	158744328	10382710	62934