import pandas as pd
%matplotlib inline
Pandas can load data from many sources, such as text files, CSV files, and Excel spreadsheets. Here we load data from a web page.
# Address of the schedule page whose HTML tables are parsed below
games_url = "https://www.basketball-reference.com/leagues/NBA_2018_games-november.html"
tables = pd.read_html(games_url)
tables
contains a list of all the HTML tables found on the web page (in this case, just one).
# Keep the first (and, for this page, only) parsed table as the working table
games = tables[0]
# games.to_csv("games.csv") # <- this can save the table in CSV format
# Preview the first rows to inspect the raw column headers
games.head()
# Replace the scraped headers with short snake_case names.
# 'd1', 'd2' and 'd3' label columns that are not used in the analysis.
new_names = [
    'date',
    'start',
    'away_team',
    'away_points',
    'home_team',
    'home_points',
    'd1',
    'd2',
    'attend',
    'd3',
]
games.columns = new_names
games.head()
# Discard the unused placeholder columns, then parse the textual dates
# (pattern "%a, %b %d, %Y", e.g. "Wed, Nov 1, 2017") into datetime values
games = games.drop(columns=['d1', 'd2', 'd3'])
games = games.assign(date=pd.to_datetime(games['date'], format="%a, %b %d, %Y"))
games.head()
# Univariate analysis: histogram of the away_points column
# Note that pandas uses matplotlib under the hood (hence the
# `%matplotlib inline` setup at the top of the file)
games.hist(column='away_points')
# Side-by-side box plots comparing the home_points and away_points distributions
games.boxplot(column=['home_points', 'away_points'])
# Attendance over time, restricted to games hosted by the Dallas Mavericks
is_mavs_home = games['home_team'] == "Dallas Mavericks"
mavs_home_games = games.loc[is_mavs_home]
mavs_home_games.plot(x='date', y='attend')
# Group-by example: per-home-team mean of every numeric column.
# numeric_only=True keeps non-numeric columns (e.g. the away-team names) out
# of the aggregation. Since pandas 2.0 the default is numeric_only=False, so a
# bare .mean() raises a TypeError on such columns instead of silently
# dropping them as older versions did.
games.groupby(['home_team']).mean(numeric_only=True)