Table of Contents
This package fetches and parses event data for Major League Baseball games. Game objects generated via the `_from_url` methods pull data from the MLBAM endpoint, where events are published within about 30 seconds of occurring. The XML source data zip file (download instructions below) contains event data from MLB games from 2008 through 2017.
pip3 install baseball
git clone git@github.com:benjamincrom/baseball.git
cd baseball/
python3 setup.py install
Fetch an object which contains metadata and events for a single MLB game.
import baseball
# Fetch a single game; the call returns the game's id together with the Game object.
# NOTE(review): arguments presumably follow (date_str, away_code, home_code,
# game_number), as documented for get_game_xml_from_url -- confirm.
game_id, game = baseball.get_game_from_url('2017-11-1', 'HOU', 'LAD', 1)
# Export the same game as a dict and as a JSON string.
game_dict = game._asdict()
game_json_str = game.json()
Write scorecard as SVG image:
# Write the scorecard SVG to "<game_id>.svg" in the current directory.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default text encoding (relevant if the SVG text contains
# non-ASCII characters).
with open(game_id + '.svg', 'w', encoding='utf-8') as fh:
    fh.write(game.get_svg_str())
2017-11-01-HOU-LAD-1.svg
Fetch a list of game objects which each contain metadata and events for a single MLB game.
First, download and unzip the source data zip file:
wget https://spaces-host.nyc3.digitaloceanspaces.com/livebaseballscorecards-artifacts/baseball_files_2008-2017.zip
unzip baseball_files_2008-2017.zip -d ./baseball_files_2008-2017
Then import the files in Python using this library:
import baseball
# Load the games dated 1-1-2017 through 12-31-2017 from the unzipped data directory.
# NOTE(review): elements are presumably (game_id, Game) tuples, matching the
# generator variant of this function -- confirm.
game_tuple_list = baseball.get_game_list_from_file_range('1-1-2017', '12-31-2017', 'baseball_files_2008-2017')
get_game_generator_from_file_range(start_date_str, end_date_str, input_dir)
Returns generator which yields (game_id, Game) tuples
get_game_xml_from_url(date_str, away_code, home_code, game_number)
Returns game_id and three strings containing XML documents: (game_id, boxscore_raw_xml, players_raw_xml, inning_raw_xml)
get_game_from_xml_strings(boxscore_raw_xml, players_raw_xml, inning_raw_xml)
Returns Game object if enough information to create one is provided. Otherwise returns None.
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import baseball
%matplotlib inline
game_id, game = baseball.get_game_from_url('11-1-2017', 'HOU', 'LAD', 1)

# One tuple per Pitch event found in the top-half appearances of every inning;
# non-Pitch events in an appearance's event list are ignored.
pitch_tuple_list = [
    (str(plate_appearance.pitcher),
     pitch.pitch_description,
     pitch.pitch_position,
     pitch.pitch_speed,
     pitch.pitch_type)
    for frame in game.inning_list
    for plate_appearance in frame.top_half_appearance_list
    for pitch in plate_appearance.event_list
    if isinstance(pitch, baseball.Pitch)
]

# Column labels follow the order of the tuple fields above.
pitch_columns = ['Pitcher', 'Pitch Description', 'Pitch Coordinate', 'Pitch Speed', 'Pitch Type']
data = pd.DataFrame(data=pitch_tuple_list, columns=pitch_columns)
data.head()
| | Pitcher | Pitch Description | Pitch Coordinate | Pitch Speed | Pitch Type |
---|---|---|---|---|---|
0 | 21 Yu Darvish | Ball | (155.47, 160.83) | 96.0 | FF |
1 | 21 Yu Darvish | Called Strike | (107.0, 171.09) | 83.9 | FC |
2 | 21 Yu Darvish | In play, no out | (115.36, 183.1) | 83.9 | SL |
3 | 21 Yu Darvish | In play, run(s) | (80.06, 168.03) | 96.6 | FF |
4 | 21 Yu Darvish | Ball | (54.1, 216.52) | 84.6 | SL |
# Bar chart of how many rows (one row per pitch) each pitcher has in `data`.
data['Pitcher'].value_counts().plot.bar()
def _mirrored_coordinates(pitcher_name, matches_description):
    """Return (xs, ys) for one pitcher's pitches whose description matches.

    Both coordinates are mirrored as ``250 - value``, reading the coordinate
    pair from index 2 of each pitch tuple.
    """
    chosen = [
        row for row in pitch_tuple_list
        if row[0] == pitcher_name and matches_description(row[1])
    ]
    return ([250 - row[2][0] for row in chosen],
            [250 - row[2][1] for row in chosen])


# One scatter plot per pitcher: balls in blue, called strikes in red,
# every other pitch description in green.
for pitcher in data['Pitcher'].unique():
    plt.ylim(0, 125)
    plt.xlim(0, 250)

    ball_x, ball_y = _mirrored_coordinates(
        pitcher, lambda desc: 'Ball' in desc)
    called_x, called_y = _mirrored_coordinates(
        pitcher, lambda desc: 'Called Strike' in desc)
    other_x, other_y = _mirrored_coordinates(
        pitcher,
        lambda desc: not ('Ball' in desc or 'Called Strike' in desc))

    ball_marks = plt.scatter(ball_x, ball_y, c='b')
    called_marks = plt.scatter(called_x, called_y, c='r')
    other_marks = plt.scatter(other_x, other_y, c='g')

    plt.legend(
        (ball_marks, called_marks, other_marks),
        ('Ball', 'Called Strike', 'Other'),
        scatterpoints=1,
        loc='upper right',
        ncol=1,
        fontsize=8,
    )
    plt.title(pitcher)
    plt.show()
# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
description_counts = data['Pitch Description'].value_counts()
description_counts.plot(
    kind='pie',
    radius=1.5,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)
# Kernel density estimate plot of the frame.
data.plot.kde()
# Overlay one pitch-speed density curve per pitcher on a shared axes.
fig, ax = plt.subplots()
ax.set_xlim(50, 120)
for pitcher in data['Pitcher'].unique():
    pitcher_speeds = data.loc[data['Pitcher'] == pitcher, 'Pitch Speed']
    pitcher_speeds.plot.kde(ax=ax, label=pitcher)
ax.legend()
# Overlay one pitch-speed density curve per pitch type on a shared axes.
fig, ax = plt.subplots()
ax.set_xlim(50, 120)
for desc in data['Pitch Type'].unique():
    is_this_type = data['Pitch Type'] == desc
    type_speeds = data.loc[is_this_type, 'Pitch Speed']
    type_speeds.plot.kde(ax=ax, label=desc)
ax.legend()
# Grouped bar chart: for each pitcher, the number of pitches per description.
fig, ax = plt.subplots(figsize=(15, 7))
pitcher_description_sizes = data.groupby(['Pitcher', 'Pitch Description']).size()
pitcher_description_sizes.unstack().plot.bar(ax=ax)
# Load the 2017 games from the unzipped data directory. The relative directory
# matches the one created by the unzip step documented earlier in this file,
# rather than a path that only exists on one developer's machine.
game_list_2017 = baseball.get_game_list_from_file_range('1-1-2017', '12-31-2017', 'baseball_files_2008-2017')

# Collect every Pitch event from appearances whose pitcher string contains
# 'Dickey', looking only at games in which Atlanta is the home or away team.
pitch_tuple_list_2 = []
for game_id, game in game_list_2017:
    if 'Atlanta Braves' not in (game.home_team.name, game.away_team.name):
        continue

    for inning in game.inning_list:
        # The bottom-half list may be falsy (e.g. missing); treat it as empty.
        appearance_list = (inning.top_half_appearance_list +
                           (inning.bottom_half_appearance_list or []))
        for appearance in appearance_list:
            if 'Dickey' not in str(appearance.pitcher):
                continue

            pitch_tuple_list_2.extend(
                (str(appearance.pitcher),
                 event.pitch_description,
                 event.pitch_position,
                 event.pitch_speed,
                 event.pitch_type)
                for event in appearance.event_list
                if isinstance(event, baseball.Pitch)
            )

df = pd.DataFrame(data=pitch_tuple_list_2, columns=['Pitcher', 'Pitch Description', 'Pitch Coordinate', 'Pitch Speed', 'Pitch Type'])
# Bar chart of pitch-type frequencies.
pitch_type_counts = df['Pitch Type'].value_counts()
pitch_type_counts.plot.bar()

# Pie chart of pitch-description frequencies; equal scaling keeps it circular
# and the empty y-label removes the default axis label.
plt.axis('equal')
pitch_description_counts = df['Pitch Description'].value_counts()
pitch_description_counts.plot(
    kind='pie',
    radius=2,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)
plt.ylabel('')
plt.show()
# Drop rows with missing values before estimating densities.
df.dropna(inplace=True)
# df.plot.kde() draws on a new axes and returns it. Configure that returned
# axes; the previously bound `ax` belongs to an earlier, unrelated figure, so
# setting limits or a legend on it would not affect this plot.
ax = df.plot.kde()
ax.set_xlim(50, 100)
ax.legend()
# Overlay one pitch-speed density curve per pitch type on a shared axes.
fig, ax = plt.subplots()
ax.set_xlim(50, 100)
for desc in df['Pitch Type'].unique():
    # 'PO' rows are left out of the density plot.
    # NOTE(review): 'PO' is presumably the pitchout code -- confirm.
    if desc == 'PO':
        continue
    type_speeds = df.loc[df['Pitch Type'] == desc, 'Pitch Speed']
    type_speeds.plot.kde(ax=ax, label=desc)
ax.legend()
import datetime
import dateutil.parser
import pytz
def _appearance_row(appearance):
    """Flatten one plate appearance into the tuple layout used for df3."""
    return (str(appearance.pitcher),
            str(appearance.batter),
            len(appearance.out_runners_list),
            len(appearance.scoring_runners_list),
            len(appearance.runners_batted_in_list),
            appearance.scorecard_summary,
            appearance.got_on_base,
            appearance.plate_appearance_summary,
            appearance.plate_appearance_description,
            appearance.error_str,
            appearance.inning_outs)


def _selected_half_innings(game):
    """Yield the appearance lists kept for one game.

    Top halves are taken when Atlanta is the home team and bottom halves when
    Atlanta is the away team; a falsy (missing or empty) bottom half is skipped.
    """
    if game.home_team.name == 'Atlanta Braves':
        for inning in game.inning_list:
            yield inning.top_half_appearance_list
    if game.away_team.name == 'Atlanta Braves':
        for inning in game.inning_list:
            if inning.bottom_half_appearance_list:
                yield inning.bottom_half_appearance_list


# Only games dated strictly after this moment are included.
game_date_cutoff = datetime.datetime(2017, 3, 31)

pitch_tuple_list_3 = []
for game_id, game in game_list_2017:
    # Check the teams first so the date is parsed only for Atlanta games.
    if 'Atlanta Braves' not in (game.home_team.name, game.away_team.name):
        continue
    if not (dateutil.parser.parse(game.game_date_str) > game_date_cutoff):
        continue

    for appearance_list in _selected_half_innings(game):
        pitch_tuple_list_3.extend(
            _appearance_row(appearance) for appearance in appearance_list
        )

# Column labels follow the field order produced by _appearance_row.
df3 = pd.DataFrame(data=pitch_tuple_list_3, columns=['Pitcher',
                                                     'Batter',
                                                     'Out Runners',
                                                     'Scoring Runners',
                                                     'RBIs',
                                                     'Scorecard',
                                                     'On-base?',
                                                     'Plate Summary',
                                                     'Plate Description',
                                                     'Error',
                                                     'Inning Outs'])
# One bar chart of plate-summary counts per pitcher, limited to pitchers with
# more than 400 recorded plate appearances in df3.
for pitcher in df3['Pitcher'].unique():
    summary = df3.loc[df3['Pitcher'] == pitcher, 'Plate Summary']
    if len(summary) <= 400:
        continue

    summary_counts = summary.value_counts(sort=False)
    fig, ax = plt.subplots()
    ax.set_ylim(0, 250)
    summary_counts.plot.bar(ax=ax)
    plt.title(pitcher)
    plt.show()
# Per pitcher: how many plate appearances ended with the batter not on base
# versus on base.
# NOTE(review): assumes the 'On-base?' values are the booleans False/True --
# confirm against the Game model.
x = []
for pitcher in df3['Pitcher'].unique():
    s = df3[df3['Pitcher'] == pitcher]['On-base?'].value_counts()
    # Keep only pitchers for whom both outcomes occur.
    if len(s) == 2:
        # Look the counts up by label. Integer keys (s[0], s[1]) are resolved
        # positionally or by label depending on the pandas version, and
        # value_counts() orders by frequency, so they can swap the two counts.
        f = s.loc[False]
        t = s.loc[True]
        x.append((str(pitcher), f, t))

df4 = pd.DataFrame(data=x, columns=['Pitcher',
                                    'Did not get on base',
                                    'Got on base'])
df4.index = df4['Pitcher']
# Plot the ten pitchers with the most "did not get on base" outcomes.
df4.sort_values(by=['Got on base']).nlargest(10, 'Did not get on base').plot.bar()