import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
    reader = csv.reader(f)
    matches = list(reader)
for id in matches:
    id = str(id)
    id = re.sub("[^0-9]","",id)
    url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
    print (url)
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)
    #GET TEAMS AND SCORES
    score = soup.findAll("div",{"class":"col-md-5 center"})
    team_home = score[0]
    team_home = str(team_home)
    team_home = re.search('title="(.*)" />',team_home)
    team_home = team_home.group(1)
    team_away = score[1]
    team_away = str(team_away)
    team_away = re.search('title="(.*)" />',team_away)
    team_away = team_away.group(1)
    goals_home = score[2]
    goals_home = str(goals_home)
    goals_home = re.sub('</h2></div>','',goals_home)
    goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
    goals_away = score[3]
    goals_away = str(goals_away)
    goals_away = re.sub('</h2></div>','',goals_away)
    goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
    #GET HOME STATS
    tables = soup.findChildren('table')
    stats_home = tables[0]
    list_of_rows_home = []
    for row in stats_home.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_home.append(list_of_cells)
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        cell = list_of_rows_home[i][2]
        cell = str(cell)
        goal = re.findall('goal',cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist',cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm',cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_home:
        del row[2]
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        row.append(team_home)
        row.append(goals_home)
        row.append(team_away)
        row.append(goals_away)
    #GET AWAY STATS
    stats_away = tables[1]
    list_of_rows_away = []
    for row in stats_away.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_away.append(list_of_cells)
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        cell = list_of_rows_away[i][2]
        cell = str(cell)
        goal = re.findall('goal',cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist',cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm',cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_away:
        del row[2]
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        row.append(team_away)
        row.append(goals_away)
        row.append(team_home)
        row.append(goals_home)
    #COMPILE INTO ONE TABLE
    list_of_rows = list_of_rows_home + list_of_rows_away
    #WRITE TO CSV
    writer.writerows(list_of_rows)

My input file is a basic Excel file, with the match IDs all listed in its first column. When the script creates the output file, it is blank, and I am not getting any error messages.
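
For illustration, here is a minimal sketch of what that ID clean-up step is handling (the row value below is a hypothetical example, assuming matches.csv holds IDs such as 123456 in its first column):

import re

# csv.reader yields each row as a list, so a row holding the ID 123456
# comes back as ['123456'] (hypothetical example value)
row = ['123456']
id = str(row)                  # "['123456']"
id = re.sub("[^0-9]", "", id)  # keep only the digits
print(id)                      # 123456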
Answer (score: 0)
The problem is in your regex search, so change it to:

team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)

Alternatively:

team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)

The /> is not needed; with it, the title="..." value never matches, so the search returns no match and the call to group(1) raises an AttributeError, which stops the script. If you do want to keep the />, remove the space from the regex pattern, because that space is what ultimately kills the match.
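
A minimal sketch of the failure mode, assuming the serialized tag looks roughly like the hypothetical string below (the site's real markup may differ slightly):

import re

# Hypothetical serialized tag -- note there is no space before "/>"
team_home = '<div class="col-md-5 center"><img src="crest.png" title="Team A"/></div>'

print(re.search('title="(.*)" />', team_home))          # None, so .group(1) would raise AttributeError
print(re.search('title="(.*)"', team_home).group(1))    # Team A
print(re.search('title="(.*)"/>', team_home).group(1))  # Team A

With the space removed, or the /> dropped entirely, the search returns a match object and group(1) yields the team name.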