INF 385T.9: Data Wrangling

Final Project Report: Geographic & Economic Correlations with Competitive Success in Major League Soccer

Anon

Spring 2022

__

Project Summary

For my project, I have chosen to explore team/stadium locations, player salaries, and match performance data for the 2021 season of U.S. Major League Soccer, in an attempt to visually analyze whatever correlations may exist between geographic location and player compensation budgets on one side and overall team/match performance on the other, across one regular season of play.

To that end, I've gathered three primary data documents pulled from two separate online resources:

  1. Sports stadium geographic coordinates (CSV) for the year 2021 compiled from individual Wikipedia records by Kaggle user Logan Donaldson -- containing all team names and home stadium latitude/longitude coordinates in Major League Baseball (MLB), National Football League (NFL), National Basketball Association (NBA), National Hockey League (NHL), and Major League Soccer (MLS) - https://www.kaggle.com/logandonaldson/sports-stadium-locations

  2. Complete Major League Soccer match data (teams, locations, scores, etc.) from 1996 to early 2022, compiled in a CSV file by Kaggle user Joseph Mohr from public data published by Major League Soccer and ESPN - https://www.kaggle.com/josephvm/major-league-soccer-dataset?select=matches.csv

  3. 2021 player salaries published directly online (HTML table) by the MLS Players Association - https://mlsplayers.org/resources/salary-guid

From this data, I have produced a few simple charts to help visually explore the following correlations:

  1. Compare cumulative team salaries based on latitude and longitude. Do salary budgets vary from north to south/east to west?

  2. Compare each team's cumulative success rate (goals scored/goals conceded) in relation to its player salary budget. Does more money spent equal more success where it matters on the field?

  3. Assess the apparent impact of home field advantage across the league. Where does being on home soil seem to have the most positive or negative impact on game performance?

Data Sources

As mentioned previously, my first two data sources (stadium locations and match data) were compiled from other sources and shared by users of Kaggle.com. As such, I was able to download both of them directly from Kaggle in CSV format, thereby requiring no additional processing before uploading to my JupyterHub - as demonstrated in these snapshots highlighting both the original raw data and the spreadsheet format displayed in JupyterLab after they were uploaded:

stadium_data_raw.jpg stadium_jupyter_view.jpg

match_data_raw.jpg match_data_jupyter_view.jpg

Despite being prepackaged and readily digestible in comma-delimited format, I nonetheless encountered my first setback with this data when trying to complete my homework assignment for week 11 - wherein we were asked to write a Python script to read one of our files and parse a few different data elements into meaningful sentences. While the .csv documents displayed perfectly through the JupyterLab CSV viewer, my attempts to read through the file and call on pieces of data by their corresponding dictionary keys kept failing. After much trial and error, I uncovered certain invisible characters such as '\ufeff' at key points in my document. After reading a few discussion boards, I eventually learned that this is due to a discrepancy between how the text was originally encoded and the default reading parameters in Python's open() function. I was finally able to overcome it by simply adding a utf-8 encoding parameter within the function before reading the file into Python. As a precaution, I went ahead and applied that parameter to each file I read thereafter as well. I suspect it isn't necessary in every case, but I haven't observed any other sneaky character problems that would prompt me to reconsider this decision.
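
For reference, the pattern ended up looking something like this minimal sketch (the file and column names here are stand-ins, not the exact headers; 'utf-8-sig' is the UTF-8 variant that explicitly strips the '\ufeff' byte order mark):

    import csv

    # 'utf-8-sig' strips the invisible '\ufeff' byte order mark from the start
    # of the file, so the first column header comes through clean.
    with open('2021_Stadiums.csv', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Without the encoding fix, the first key can arrive as '\ufeffTeam'
            # instead of 'Team', and row['Team'] fails with a KeyError.
            print(f"{row['Team']} plays its home games at ({row['Lat']}, {row['Long']}).")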

The player salary data was not readily available in .csv format, but rather couched within a scrollable HTML table on the MLS Players Association website. I simply used the HTML to CSV table converter (https://www.convertcsv.com/html-table-to-csv.htm) to extract this data and export it into .csv format before uploading to JupyterHub - as seen in these raw vs uploaded version screenshots:

player_data_raw.jpg player_data_jupyter_view.jpg

Workflow

Screenshot 2022-05-05 073340.jpg

Data Collection

SQL Database Initialization

Parsing Data and Loading Into SQL Database

Export For Analysis

Visual Analysis

ER Diagram

Screenshot 2022-05-04 031147.jpg Screenshot 2022-05-04 031351.jpg

I've chosen to spread this data out across the essential components of gameplay: the player, the team, and the match.
Though my player and team location datasets are confined to 2021, I've arranged players and teams in a has-and-belongs-to-many (habtm) relationship, with a shared relational table in between them, in order to acknowledge the temporal nature of any player/team assignment and compensation package. Were I to obtain new data for subsequent years, this would hopefully make it easier to track a player's location and pay rate over time without overly complicating the players table. Likewise, placing team_match_appearances in between teams and matches makes it simpler to parse out and analyze each individual team's performance within each game while still having a dedicated repository for more basic match data like date/time, attendance, and location (via the home_team_id).

Were I to pull out all of the additional player data available in 'MLS_Matches.csv', I would almost certainly need to resolve quite a few player name aliases. However, I found enough unresolvable inconsistencies in the player information contained within that matches dataset to render it not worth such additional effort. Instead, the only attribute name variants I had to deal with came in the form of team names - i.e. some data sources simply list the city, others may or may not include foreign language accent marks or punctuation (e.g. Washington DC vs D.C.), and then - because it's American soccer, after all - there are any number of opinions as to whether or not a team should attach an FC or CF to its name.

To reconcile these differing names so that their corresponding data all become attached to the same team entity, I've created a separate team_names table specifically to contain all possible team name variants I may come across in my data. I've chosen to treat the teams table as the definitive record for each team - containing what I consider its "standard_name" along with its location, etc. As such, teams.id is the definitive identifier for each recognized team, and it is attached to each team name variant in the team_names table as a way of ensuring that any data source that references a given name variant will draw a consistent team_id as we write its data across various associated tables.

Originally, populating all the known team name variants into the team_names table was a rather manual affair. I copied each corresponding team column from each dataset into a separate Excel spreadsheet, wherein I eliminated all the duplicate entries and whittled it down to a tidy little naming key of sorts, with each name variant represented in its own row along with whatever its corresponding StandardizedName ought to be - even if that happens to be the same as the variant itself. That last part was necessary for the matching process to work correctly after this homemade name variant dataset was copied into the team_names table, where it would then serve as a key for future data reads. Essentially, when I read the name variant csv, I told Python to add each NameVariant to the team_names table and then use the StandardizedName column to match up with the standard_name in the teams table and attach its teams.id alongside the name variant in the team_names table. The manually compiled name variant spreadsheet and the code I used to populate it into my database looked like this:

team_variant_csv_example.jpg
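
Since that spreadsheet and its loading code only survive here as a screenshot, a simplified sketch of the logic looks like this (assuming sqlite3, a 'team_name_variants.csv' export of the spreadsheet, and the NameVariant/StandardizedName headers described above; the database file name is a stand-in):

    import csv
    import sqlite3

    conn = sqlite3.connect('mls_2021.db')   # database file name is illustrative
    cur = conn.cursor()

    with open('team_name_variants.csv', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Look up the definitive team record by its standardized name...
            cur.execute("SELECT id FROM teams WHERE standard_name = ?",
                        (row['StandardizedName'],))
            team_id = cur.fetchone()[0]
            # ...then store the variant spelling alongside that teams.id so any
            # data source using this spelling resolves to the same team.
            cur.execute("INSERT INTO team_names (name_variant, team_id) VALUES (?, ?)",
                        (row['NameVariant'], team_id))

    conn.commit()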

Now then, why am I referring to this in the past tense? Well... While this strategy seemed to work just swell - and in fact was one of the strategies Dr. Howison had advised us to use - the somewhat awkward and potentially error-prone method by which I had derived that homemade variant list just didn't sit well with me over time. What if I had missed one of the variants somewhere? What if I copied in an updated dataset that contains yet another variant I might have overlooked? For these reasons, I ended up wasting WAY too much of my last day of work on this project throwing out my old name variant csv and wrestling with Python logic I only barely understand to try to automate the process of catching any new variant that pops up in my data and populating the team_names table (or not) accordingly. The code is of course listed below, but essentially I've just incorporated a pre-read of each of my csv documents to review the team names before looking at any of the other data. As mentioned before, I'm treating the names used in the stadiums dataset as my standard, so that gets read and written to the team_names table first - I do this at the same time I write the data to the teams table. Then, with each new document, I tell Python to read through each line and check the team name against the team_names table; if the name isn't there, I get a prompt telling me the name variant in question, displaying a list of acceptable standard_names along with their teams.id, and asking me which one to match the name variant up with. If the data is irrelevant in some way, there's also the option to pass and ignore that team name altogether. The version I landed on in the end surely isn't the best possible solution to this problem - and in fact it still involves a manual id assignment step - but it gets the job done and makes me feel more confident that I've actually caught everything and could continue to catch any new variants in each future data import.

The momentary interaction ends up looking like this:

Screenshot 2022-05-05 091348.jpg
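
Since the full script only appears here as a screenshot, the core of that interaction boils down to something like the following sketch (table and column names follow my ER diagram; the real version also persists the ignore list between runs):

    def resolve_team_id(cur, raw_name, ignored_names):
        """Return the teams.id for a raw team name, prompting me when it's new."""
        if raw_name in ignored_names:
            return None   # previously marked irrelevant; skip silently
        cur.execute("SELECT team_id FROM team_names WHERE name_variant = ?", (raw_name,))
        match = cur.fetchone()
        if match:
            return match[0]
        # Unknown variant: list the accepted standard names and ask which one it is.
        cur.execute("SELECT id, standard_name FROM teams ORDER BY id")
        for team_id, standard_name in cur.fetchall():
            print(f"{team_id}: {standard_name}")
        choice = input(f"New team name '{raw_name}' found. Enter a teams.id, or 'pass' to ignore: ")
        if choice.lower() == 'pass':
            ignored_names.add(raw_name)
            return None
        cur.execute("INSERT INTO team_names (name_variant, team_id) VALUES (?, ?)",
                    (raw_name, int(choice)))
        return int(choice)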

_

Loading Data Into My Database

>Create my database

>Create each of my tables according to my ER Diagram sketch
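
The exact column lists live in my notebooks, but a stripped-down version of the table definitions (SQLite syntax, limited to the columns discussed in this report, with a few guessed names) looks roughly like this:

    import sqlite3

    conn = sqlite3.connect('mls_2021.db')   # database file name is illustrative
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS teams (
        id INTEGER PRIMARY KEY,
        standard_name TEXT,
        latitude REAL,
        longitude REAL
    );
    CREATE TABLE IF NOT EXISTS team_names (
        name_variant TEXT,
        team_id INTEGER REFERENCES teams(id)
    );
    CREATE TABLE IF NOT EXISTS players (
        id INTEGER PRIMARY KEY,
        name TEXT
    );
    CREATE TABLE IF NOT EXISTS players_teams (
        player_id INTEGER REFERENCES players(id),
        team_id INTEGER REFERENCES teams(id),
        season INTEGER,
        compensation REAL
    );
    CREATE TABLE IF NOT EXISTS matches (
        id INTEGER PRIMARY KEY,
        match_datetime TEXT,
        attendance INTEGER,
        home_team_id INTEGER REFERENCES teams(id)
    );
    CREATE TABLE IF NOT EXISTS team_match_appearances (
        match_id INTEGER REFERENCES matches(id),
        team_id INTEGER REFERENCES teams(id),
        is_home INTEGER,
        goals_for INTEGER,
        goals_against INTEGER
    );
    """)
    conn.commit()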

>Read '2021_Stadiums.csv' and write data to the teams table.

Also, add each standard name variant to the team_names table along with its corresponding teams.id
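
In sketch form, reusing the conn/cur from the earlier sketches (the Kaggle file's column headers are stand-ins, and since that file covers five leagues, only the MLS rows matter):

    with open('2021_Stadiums.csv', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            if row['League'] != 'MLS':    # skip MLB/NFL/NBA/NHL rows
                continue
            cur.execute("INSERT INTO teams (standard_name, latitude, longitude) VALUES (?, ?, ?)",
                        (row['Team'], row['Lat'], row['Long']))
            team_id = cur.lastrowid
            # The stadium file's spelling doubles as the standard name, so it is
            # also recorded as the first "variant" pointing at this teams.id.
            cur.execute("INSERT INTO team_names (name_variant, team_id) VALUES (?, ?)",
                        (row['Team'], team_id))
    conn.commit()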

Verifying data load with a SQL notebook:

Screenshot 2022-05-05 034645.jpg Screenshot 2022-05-05 034854.jpg

>Review team names in the rest of my datasets and add new team name variants to team_names when applicable.

This is the extra, arguably unnecessary side project I described previously, which is meant to replace the manually created name variant spreadsheet and allow for a more consistent, automated process.

One thing I failed to mention previously is that by implementing this and testing to make sure it was working correctly, I actually discovered an extra name variant which had somehow been omitted in my previous drafts. It seems my uneasiness with that previous strategy had at least some merit. Enough to justify the amount of time I wasted chipping away at this replacement when I should have been crafting more meaningful analyses? That I couldn't say.

Verify new team name variants are populating team_names via SQL notebook:

Screenshot 2022-05-05 035640.jpg

It's working!

>Populate players table with names from MLS_2021_Player_Salaries.csv. Add player id and team assignment data to players_teams table.
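
A condensed sketch of that step, reusing the resolve_team_id() helper and the conn/cur from the sketches above (the salary file's column headers here are guesses, not the exact ones):

    ignored_names = set()   # team names I've chosen to skip during this import

    with open('MLS_2021_Player_Salaries.csv', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            team_id = resolve_team_id(cur, row['Club'], ignored_names)
            if team_id is None:
                continue
            cur.execute("INSERT INTO players (name) VALUES (?)",
                        (f"{row['First Name']} {row['Last Name']}",))
            player_id = cur.lastrowid
            # The join table carries the 2021 assignment and compensation package.
            cur.execute("""INSERT INTO players_teams (player_id, team_id, season, compensation)
                           VALUES (?, ?, 2021, ?)""",
                        (player_id, team_id, row['Guaranteed Compensation']))
    conn.commit()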

Verify players and players_teams table population with SQL notebook:

Screenshot 2022-05-05 043257.jpg Screenshot 2022-05-05 043333.jpg

>Populate matches table and team_match_appearances table with data from matches.csv -

matching relevant team_id for each home and away team
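
Continuing the same sketch (the matches.csv column names are stand-ins; the real code also filters down to the 2021 regular season and builds the timestamp described under Challenges):

    with open('matches.csv', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            home_id = resolve_team_id(cur, row['home'], ignored_names)
            away_id = resolve_team_id(cur, row['away'], ignored_names)
            if home_id is None or away_id is None:
                continue   # fixture involves a team I've chosen to ignore
            cur.execute("INSERT INTO matches (match_datetime, attendance, home_team_id) VALUES (?, ?, ?)",
                        (row['date'], row['attendance'], home_id))
            match_id = cur.lastrowid
            # One appearance row per team, each from that team's point of view.
            cur.execute("""INSERT INTO team_match_appearances
                           (match_id, team_id, is_home, goals_for, goals_against)
                           VALUES (?, ?, 1, ?, ?)""",
                        (match_id, home_id, row['home_score'], row['away_score']))
            cur.execute("""INSERT INTO team_match_appearances
                           (match_id, team_id, is_home, goals_for, goals_against)
                           VALUES (?, ?, 0, ?, ?)""",
                        (match_id, away_id, row['away_score'], row['home_score']))
    conn.commit()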

Verify matches and team_match_appearances table population with SQL notebook

Screenshot 2022-05-05 044423.jpg Screenshot 2022-05-05 044559.jpg

__

Exporting for Analysis

CSV Export #1: Team Salaries vs Geographic Location

(team name / total player compensations by team / team latitude / team longitude)

CSV Export #2: Team Salaries vs Regular Season Match Performance

(team name / total player compensation / total goals for / total goals against / goal differential / total wins)

CSV Export #3: Goal Counts by Stadium Location

(host stadium team name / host latitude / host longitude / home goal total / away goal total / combined goal total / goal differential)
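
As an example of how these exports were shaped, Export #2 boils down to a grouped join with a salary subquery, roughly like the sketch below against my schema (the real query also restricts matches to the 2021 regular season, and the output file name is a stand-in):

    export_query = """
    SELECT t.standard_name                                                     AS team_name,
           (SELECT SUM(pt.compensation)
              FROM players_teams pt
             WHERE pt.team_id = t.id)                                          AS total_compensation,
           SUM(tma.goals_for)                                                  AS goals_for,
           SUM(tma.goals_against)                                              AS goals_against,
           SUM(tma.goals_for - tma.goals_against)                              AS goal_differential,
           SUM(CASE WHEN tma.goals_for > tma.goals_against THEN 1 ELSE 0 END)  AS wins
      FROM teams t
      JOIN team_match_appearances tma ON tma.team_id = t.id
     GROUP BY t.id
     ORDER BY total_compensation DESC
    """

    with open('export2_salaries_vs_performance.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['team_name', 'total_compensation', 'goals_for',
                         'goals_against', 'goal_differential', 'wins'])
        writer.writerows(cur.execute(export_query))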

Visual Analysis

Having exported these three .csv files, I chose to try out Tableau for the data visualizations.

Comparing total team compensation rates from east to west and south to north.

compensation_vs_longitude.jpg

compensation_vs_latitude.jpg

Comparing total team compensation with cumulative goal output and cumulative goals conceded.

performance_v_compensation.jpg

Evaluation of Homefield Advantage - Goals Scored vs Goals Conceded at Home

homefield_advantage.jpg

Takeaways from these charts:

Unsurprisingly, the geographical comparisons of salary and goal output proved rather unenlightening. The relatively even spread of salary rankings and goal outputs across the full spectrum of latitudes and longitudes suggests barely a hint of correlation there.

While less of a true analysis and more of a straightforward ranking of performance, I found the home stadium goal differential chart to be somewhat more interesting than the others. On a personal level as a soccer fan, it's interesting to see just how much better or worse certain teams tend to perform in their own home stadiums. Anecdotally, Seattle is understood to be a fortress of impenetrable home team success. Yet, at least in 2021, they appeared to perform only half as well at home as NY City. Unlike the previous charts, this one provides a very clear answer as to which teams performed well at home last year and which did not.

My Chosen Analysis Tool: Tableau

I specifically chose Tableau because I was unfamiliar with it. I've encountered the name periodically over the last couple of years, always in a rather matter-of-fact way that suggests I should at least know something about it. So, rather than taking my freshly wrangled data straight to trusty ol' Excel, I dove head first into Tableau. While I was ultimately able to drag some things around and tweak some knobs here and there to make my charts look reasonably professional, I found the interface to be somewhat overwhelming and not very intuitive. Having very little time to delve too deeply into tutorial videos and whatnot, I really just adopted a fake-it-till-you-make-it approach, which turned out better than I would've expected. That said, my charts are very simple and probably could have been done just as well, if not better, in Excel. But then I still wouldn't have any idea what Tableau is all about.

Now that I realize I have free usage of it while I'm a student, I'll no doubt do some more exploring to try to unlock its more advanced capabilities. For this project, though, I found the Python and SQL wrangling to be far more satisfying than the final visual results.

__

Challenges

Though I've dabbled in only a little coding in my adult life, my experiences with this project seemed consistent with most of those other experiences -- which is to say, it's the little things that really add up and take a lot of time to sort out. After hours or days of head scratching, I all too often look back on my work and all I see is a handful of relatively simple lines of code that couldn't possibly have caused me that much grief! More often than not, those little things in this project involved invisible characters and/or renegade spaces in my string data -- both of which are just so hard to identify directly. They're like black holes - all I can do is detect what's going on around them and deduce that they're in there. There were so many instances where I would run a SELECT * query and see certain output right there in front of me, but then if I added a WHERE clause specifying the very same data I had just seen, I would suddenly get a blank response. I wish I could say I've learned a specific tool or two for tracking down exactly what the invisible thing is, but the truth is it remains trial and error. That said, I've now at least been around that block enough times that I can spot rather quickly that I do in fact have an invisible problem, even if I can't quite make it visible at this point.
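
For what it's worth, one simple way to at least make such characters visible is Python's repr(), as in this made-up example (the value below is purely illustrative):

    value_from_csv = '\ufeffSeattle Sounders FC '   # made-up example of a dirty value

    print(value_from_csv)         # looks perfectly normal when printed
    print(repr(value_from_csv))   # reveals the BOM and the trailing space

    cleaned = value_from_csv.lstrip('\ufeff').strip()   # drop the BOM, then stray whitespace
    print(repr(cleaned))          # 'Seattle Sounders FC'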

During the data importing stage, I rather enjoyed playing with the datetime module, which allowed me to pull together data from three separate columns and convert it into one tidy timestamp. Likewise, I found the export stage provided some useful opportunities to practice and better understand some subqueries and grouping strategies that sometimes tie my brain in knots.
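
That datetime step looked roughly like the following sketch - the three column names and their formats are stand-ins, since the actual matches.csv layout isn't reproduced in this report:

    from datetime import datetime

    # Illustrative values standing in for three columns from one matches.csv row.
    row = {'date': 'Sunday, November 7', 'year': '2021', 'time': '3:30 PM'}

    combined = f"{row['date']} {row['year']} {row['time']}"
    kickoff = datetime.strptime(combined, '%A, %B %d %Y %I:%M %p')
    print(kickoff.isoformat())   # 2021-11-07T15:30:00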

Aside from that, it's kind of hard for me to think of any one major eureka moment where a concept really clicked and I finally overcame an obstacle. It was more of a steady stream of small challenges, trying to get the syntax right here and there or dialing in the right regex tweak. As I'm typing this, though, I realize there is one eureka I'm overlooking, which in fact had me feeling rather dumb just a couple of weeks ago: dealing with name variations via my database design.

Early on in this development process, when this was first discussed in class and in a couple of private meetings with Dr. Howison, I just couldn't seem to wrap my head around how having an extra table in my database was going to help me bring together entities with totally different names. It made sense on an intuitive level, but I just couldn't connect the dots to put it into practice. I had such a conceptual blockage going on that my mind just kept trying to come up with other solutions that I COULD wrap my head around. Like, why not just build a Python dictionary and process the name variants at the point of .csv reading - before they even reach the database? Interestingly enough, I think it was precisely that sort of mental avoidance that finally helped me to understand how it's supposed to work. Clearly I had wrapped my head around the utility of building a temporary Python dictionary in order to help pair the variant names with the standardized names, so it was just a small step (or trip, really) that led me to realize that that extra team_names table basically is my Python dictionary! It's serving the exact same role I was envisioning Python should fill instead. Like so many things, I really just needed to think my way around the problem to finally see what I was looking at. From there, the rest of my design really started to click into place.

While wrapping my head around that concept was perhaps my biggest (and somewhat embarrassing) challenge in this project, my favorite accomplishment was probably the name variant identification tool I ended up creating in order to implement that team_names table. It's really not all that elegant, and I'm sure the same effect could probably be accomplished with a fraction of the code were I more experienced. But it was a satisfying exercise, taking what little Python knowledge I have and awkwardly hammering it into my problem until I arrived at a solution that does the essential things I need it to do. It identifies all the new name variants in my datasets without me having to look over them manually. It gives me the opportunity to match them directly with the appropriate standard name. It facilitates the discarding of non-useful team name variants. And it remembers what I've discarded so I don't have to do so repeatedly for every line of data it may appear in. It's about all I could ask for short of having Python analyze the text directly and select the best match for me -- which I wouldn't trust to begin with. For what I wanted to accomplish in this moment, that bit of code turned out perfectly. And it feels good to make something useful with tools I barely understand.

All in all, I'd say this was a successful project for me. Though I'm not particularly proud of the somewhat uninspiring final data analysis portion, I found this to be a fruitful exercise that helped me to better understand all that we've covered over the course of this semester.