albertlyu / ncaab-pbp

A Python project for scraping NCAA men's basketball play-by-play data
MIT License
3 stars 0 forks source link

Design data model for NCAA pbp data #1

Closed albertlyu closed 10 years ago

albertlyu commented 10 years ago

Let's assume we store the JSON files to disk for now. Design a data model and a database schema to intelligently store values from the parsed JSON. Tables for players, games, teams, and plays are needed at the very least. We need an intelligent way to handle made shots that were assisted, any instances of 'player 2,' etc. Once this is completed, we can work on #2 and decide the open source RDBMS to use that our schema is most tailored for, then write stored procedures to accommodate the data model.

albertlyu commented 10 years ago

This is my first stab at a sqlite3 database schema. Storing the JSON in a field as well in case there's useful metadata to be had. I'd still like to consider PostgreSQL as the database though because of the upcoming 9.4 release, and the possibility of storing JSON not just as a text field, but eventually as a full document store (http://www.craigkerstiens.com/2014/03/24/Postgres-9.4-Looking-up/).

import sqlite3
import simplejson as json

db = sqlite3.connect("../data/pbp.db")
c = db.cursor()

c.execute('''CREATE TABLE games (
    game_id INTEGER PRIMARY KEY,
    date NUMERIC,
    FOREIGN KEY(home_team_id) REFERENCES teams(team_id),
    FOREIGN KEY(away_team_id) REFERENCES teams(team_id),
    json BLOB
    )
''')

c.execute('''CREATE TABLE players (
    player_id INTEGER PRIMARY KEY,
    player_first_name TEXT,
    player_last_name TEXT,
    FOREIGN KEY(player_team_id) REFERENCES teams(team_id) 
    )
''')

c.execute('''CREATE TABLE teams (
    team_id INTEGER PRIMARY KEY,
    team_alias TEXT
    )
''')

c.execute('''CREATE TABLE lkup_shot (
    detail_id INTEGER PRIMARY KEY,
    detail_desc TEXT
    )
''')

c.execute('''CREATE TABLE lkup_event (
    event_id INTEGER PRIMARY KEY,
    event_desc TEXT
    )
''')

c.execute('''CREATE TABLE plays (
    play_id INTEGER PRIMARY KEY, # insert NULL into this column so that it autoincrements
    FOREIGN KEY(game_id) REFERENCES games(game_id),
    half INTEGER,
    time_minutes INTEGER,
    time_seconds INTEGER,
    details TEXT,
    player_id_1 INTEGER,
    player_id_2 INTEGER,
    player_id_1_linkable NUMERIC,
    player_id_2_linkable NUMERIC,
    player_id_3_linkable NUMERIC,
    player_first_name_1 TEXT,
    player_first_name_2 TEXT,
    player_last_name_1 TEXT,
    player_last_name_2 TEXT,
    player_team_alias_1 TEXT,
    player_team_alias_2 TEXT,
    home_score INTEGER,
    visitor_score INTEGER,
    visitor_fouls INTEGER,
    home_fouls INTEGER,
    player_fouls INTEGER,
    fastbreak INTEGER,
    in_paint INTEGER,
    second_chance INTEGER,
    off_turnover INTEGER,
    player_score INTEGER,
    points_type INTEGER,
    FOREIGN KEY(detail_id) REFERENCES lkup_shot(detail_id),
    FOREIGN KEY(event_id) REFERENCES lkup_event(event_id),
    distance INTEGER,
    x_coord REAL,
    y_coord REAL,
    FOREIGN KEY(team_id_1) REFERENCES teams(team_id),
    FOREIGN KEY(team_id_2) REFERENCES teams(team_id),
    FOREIGN KEY(team_id_3) REFERENCES teams(team_id),
    json BLOB
    )
''')

c.execute('''CREATE TABLE shots (
    shot_id INTEGER PRIMARY KEY,
    FOREIGN KEY(play_id) REFERENCES plays(play_id),
    FOREIGN KEY(game_id) REFERENCES games(game_id),
    FOREIGN KEY(team_id) REFERENCES teams(team_id),
    FOREIGN KEY(home_team_id) REFERENCES teams(team_id),
    FOREIGN KEY(away_team_id) REFERENCES teams(team_id),
    FOREIGN KEY(player_id) REFERENCES players(player_id),
    FOREIGN KEY(player_id_assist) REFERENCES players(player_id),
    #block_name TEXT, # may not be available with ncaab pbp
    FOREIGN KEY(detail_id) REFERENCES lkup_shot(detail_id),
    FOREIGN KEY(event_id) REFERENCES lkup_event(event_id),
    details TEXT,
    points_type INTEGER,
    made NUMERIC,
    x REAL,
    y REAL,
    json BLOB
    )
''')
albertlyu commented 10 years ago

Update:

import sqlite3

db = sqlite3.connect("../data/pbp.db")
c = db.cursor()

# DROP TABLE IF EXISTS tables
c.execute('''DROP TABLE IF EXISTS teams''')
c.execute('''DROP TABLE IF EXISTS games''')
c.execute('''DROP TABLE IF EXISTS players''')
c.execute('''DROP TABLE IF EXISTS lkup_shot''')
c.execute('''DROP TABLE IF EXISTS lkup_event''')
c.execute('''DROP TABLE IF EXISTS plays''')
c.execute('''DROP TABLE IF EXISTS shots''')

# INSERT OR IGNORE INTO teams (team_id, team_alias, team_name, team_mascot, team_div, team_conf) VALUES ()
c.execute('''CREATE TABLE teams (
    team_id INTEGER PRIMARY KEY,
    team_alias TEXT UNIQUE,
    team_name TEXT UNIQUE,
    team_mascot TEXT,
    team_div TEXT,
    team_conf TEXT
    )
''')

# INSERT OR IGNORE INTO games (game_id, date, year, home_team_id, away_team_id, venue_name, venue_city, venue_state) VALUES ()
c.execute('''CREATE TABLE games (
    game_id INTEGER PRIMARY KEY,
    date NUMERIC,
    year NUMERIC,
    home_team_id INTEGER,
    away_team_id INTEGER,
    venue_name TEXT,
    venue_city TEXT,
    venue_state TEXT,
    FOREIGN KEY(home_team_id) REFERENCES teams(team_id),
    FOREIGN KEY(away_team_id) REFERENCES teams(team_id)
    )
''')

# INSERT OR IGNORE INTO players (player_id, player_first_name, player_last_name, player_team_id) VALUES ()
c.execute('''CREATE TABLE players (
    player_id INTEGER PRIMARY KEY,
    player_first_name TEXT,
    player_last_name TEXT,
    player_team_id INTEGER,
    FOREIGN KEY(player_team_id) REFERENCES teams(team_id) 
    )
''')

# manual inserts
#c.execute('''CREATE TABLE lkup_shot (
#   detail_id INTEGER PRIMARY KEY,
#   detail_desc TEXT
#   )
#''')

# manual inserts
#c.execute('''CREATE TABLE lkup_event (
#   event_id INTEGER PRIMARY KEY,
#   event_desc TEXT
#   )
#''')

# insert NULL into play_id so that it autoincrements
c.execute('''CREATE TABLE plays (
    play_id INTEGER PRIMARY KEY,
    game_id INTEGER,
    half INTEGER,
    time_minutes INTEGER,
    time_seconds INTEGER,
    details TEXT,
    player_id_1 INTEGER,
    player_id_2 INTEGER,
    player_id_1_linkable NUMERIC,
    player_id_2_linkable NUMERIC,
    player_id_3_linkable NUMERIC,
    player_first_name_1 TEXT,
    player_first_name_2 TEXT,
    player_last_name_1 TEXT,
    player_last_name_2 TEXT,
    home_score INTEGER,
    visitor_score INTEGER,
    visitor_fouls INTEGER,
    home_fouls INTEGER,
    player_fouls INTEGER,
    player_score INTEGER,
    points_type INTEGER,
    detail_desc TEXT,
    event_desc TEXT,
    distance INTEGER,
    x_coord REAL,
    y_coord REAL,
    team_id_1 INTEGER,
    team_id_2 INTEGER,
    team_id_3 INTEGER,
    FOREIGN KEY(game_id) REFERENCES games(game_id),
    FOREIGN KEY(team_id_1) REFERENCES teams(team_id),
    FOREIGN KEY(team_id_2) REFERENCES teams(team_id),
    FOREIGN KEY(team_id_3) REFERENCES teams(team_id)
    )
''')

# insert NULL into shot_id so that it autoincrements
c.execute('''CREATE TABLE shots (
    shot_id INTEGER PRIMARY KEY,
    game_id INTEGER,
    half INTEGER,
    time_minutes INTEGER,
    time_seconds INTEGER,
    details TEXT,
    player_id INTEGER,
    player_id_assist INTEGER,
    home_score INTEGER,
    visitor_score INTEGER,
    visitor_fouls INTEGER,
    home_fouls INTEGER,
    player_fouls INTEGER,
    player_score INTEGER,
    points_type INTEGER,
    detail_desc TEXT,
    event_desc TEXT,
    distance INTEGER,
    x_coord REAL,
    y_coord REAL,
    team_id INTEGER,
    FOREIGN KEY(game_id) REFERENCES games(game_id),
    FOREIGN KEY(team_id) REFERENCES teams(team_id),
    FOREIGN KEY(player_id) REFERENCES players(player_id),
    FOREIGN KEY(player_id_assist) REFERENCES players(player_id)
    )
''')
albertlyu commented 10 years ago

Resolved via https://github.com/albertlyu/ncaab-pbp/commit/37713545a0d0804525180824663f68f64136b56f.