parse_data.py
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

cwd = os.getcwd()
position = 'DEF'
week = '1'
html_path = cwd + '/data/week' + week + '/' + position + '/' + position + '_urlrawhtml.txt'

# Read the raw HTML stored in a text file at the given path
def open_html(path):
    with open(path, 'rb') as f:
        print(f'Opening file: {path}\n')
        return f.read()

html = open_html(html_path)
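# Illustrative sketch only (the markup below is invented, not taken from the
# real page): the parsing below assumes a header row with class 'tableclmhdr'
# followed by one <tr> per player whose cells carry class 'sort1'.
_sample_html = """
<table>
  <tr class="tableclmhdr"><td>Player</td><td>Team</td></tr>
  <tr><td class="sort1">1. John Doe</td><td class="sort1">SF</td></tr>
</table>
"""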
soup = BeautifulSoup(html, 'html.parser')

# The data is a table written in HTML.
# Find the table row that holds the column names.
header_row = soup.find('tr', {'class': 'tableclmhdr'})
# The first player's information is in the <tr> tag right after it
cur_player = header_row.find_next('tr')

finished = False
# player_data collects the values for the player currently being parsed
player_data = []
# all_data is the master list that stores every player's row
all_data = []

while not finished:
    try:
        info = cur_player.find_all('td', {'class': 'sort1'})
        for i in range(len(info)):
            if i == 0:
                # Special case to extract the player name:
                # strip newline characters first
                text = info[i].get_text().replace('\n', '')
                # The string now has the form '1. firstname lastname';
                # the regex captures everything after the period
                pattern = re.compile(r'\.(.*)')
                match = pattern.search(text)
                name = match.group(1).strip()
                player_data.append(name)
            else:
                # Every other column is a plain stat value
                stat = info[i].get_text().strip()
                player_data.append(stat)
            # Last table row has no player data; hacky fix so it is not appended
            if i == len(info) - 1:
                all_data.append(player_data)
                player_data = []
        # Move on to the next player
        cur_player = cur_player.find_next('tr')
    except Exception:
        print('Done parsing all data\n')
        finished = True
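# Quick illustration of the name extraction above, using a hypothetical
# sample string in the '1. firstname lastname' form the table produces:
assert re.search(r'\.(.*)', '12. John Doe').group(1).strip() == 'John Doe'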
# The columns of the CSV depend on the position
if position == 'RB':
    columns = ['Player', 'Team', 'Games', 'Attempts',
               'RushingYards', 'RushingTD', 'Targets', 'Receptions',
               'ReceivingYards', 'ReceivingTD', 'FantasyPoints', 'FantasyPointsPerGame']
elif position == 'WR':
    columns = ['Player', 'Team', 'Games', 'Targets', 'Receptions',
               'ReceivingYards', 'ReceivingTD', 'Attempts', 'RushingYards',
               'RushingTD', 'FantasyPoints', 'FantasyPointsPerGame']
elif position == 'TE':
    columns = ['Player', 'Team', 'Games', 'Targets', 'Receptions',
               'ReceivingYards', 'ReceivingTD', 'FantasyPoints',
               'FantasyPointsPerGame']
elif position == 'QB':
    columns = ['Player', 'Team', 'Games', 'PassingCompletions', 'PassingAttempts',
               'PassingYards', 'PassingTD', 'Interceptions', 'RushingAttempts',
               'RushingYards', 'RushingTD', 'FantasyPoints', 'FantasyPointsPerGame']
elif position == 'K':
    columns = ['Player', 'Team', 'Games', 'FGM', 'FGA',
               'FGPercentage', 'EPM', 'EPA', 'FantasyPoints', 'FantasyPointsPerGame']
elif position == 'DEF':
    columns = ['Team', 'Games', 'Sack', 'FumbleRecovery', 'Interception',
               'DEFTD', 'PointsAllowed', 'PassingYardsAllowed', 'RushingYardsAllowed',
               'Safety', 'KickTD', 'FantasyPoints', 'FantasyPointsPerGame']
# Build the dataframe holding the master list of all player data
df = pd.DataFrame(all_data, columns=columns)
csv_path = cwd + '/data/week' + week + '/' + position + '/' + position + '.csv'
df.to_csv(csv_path, index=False)
print(f'{position} dataframe written to CSV\n')
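# Optional sanity check (a sketch, not part of the original workflow): read
# the CSV back and print the first few rows to confirm the parse looks right.
check_df = pd.read_csv(csv_path)
print(check_df.head())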