import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
    reader = csv.reader(f)
    matches = list(reader)
for id in matches:
    id = str(id)
    id = re.sub("[^0-9]","",id)
    url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
    print (url)
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)
    #GET TEAMS AND SCORES
    score = soup.findAll("div",{"class":"col-md-5 center"})
    team_home = score[0]
    team_home = str(team_home)
    team_home = re.search('title="(.*)" />',team_home)
    team_home = team_home.group(1)
    team_away = score[1]
    team_away = str(team_away)
    team_away = re.search('title="(.*)" />',team_away)
    team_away = team_away.group(1)
    goals_home = score[2]
    goals_home = str(goals_home)
    goals_home = re.sub('</h2></div>','',goals_home)
    goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
    goals_away = score[3]
    goals_away = str(goals_away)
    goals_away = re.sub('</h2></div>','',goals_away)
    goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
    #GET HOME STATS
    tables = soup.findChildren('table')
    stats_home = tables[0]
    list_of_rows_home = []
    for row in stats_home.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_home.append(list_of_cells)
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        cell = list_of_rows_home[i][2]
        cell = str(cell)
        goal = re.findall('goal',cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist',cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm',cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_home:
        del row[2]
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        row.append(team_home)
        row.append(goals_home)
        row.append(team_away)
        row.append(goals_away)
    #GET AWAY STATS
    stats_away = tables[1]
    list_of_rows_away = []
    for row in stats_away.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_away.append(list_of_cells)
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        cell = list_of_rows_away[i][2]
        cell = str(cell)
        goal = re.findall('goal',cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist',cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm',cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_away:
        del row[2]
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        row.append(team_away)
        row.append(goals_away)
        row.append(team_home)
        row.append(goals_home)
    #COMPILE INTO ONE TABLE
    list_of_rows = list_of_rows_home + list_of_rows_away
    #WRITE TO CSV
    writer.writerows(list_of_rows)

My input file is a basic Excel file, with the match IDs all listed in its first column. When the script creates the output file, it is blank, and I am not getting any error messages.
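
For illustration, here is a minimal sketch of what that ID clean-up step is handling (the row value below is a hypothetical example, assuming matches.csv holds IDs such as 123456 in its first column):

import re

# csv.reader yields each row as a list, so a row holding the ID 123456
# comes back as ['123456'] (hypothetical example value)
row = ['123456']
id = str(row)                  # "['123456']"
id = re.sub("[^0-9]", "", id)  # keep only the digits
print(id)                      # 123456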
Answer (score: 0)
The problem is in your regex search, so change it to:

team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)

Alternatively:

team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)

The /> is not needed; with it, the title="..." value never matches, so the search returns no match and the call to group(1) raises an AttributeError, which stops the script. If you do want to keep the />, remove the space from the regex pattern, because that space is what ultimately kills the match.
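
A minimal sketch of the failure mode, assuming the serialized tag looks roughly like the hypothetical string below (the site's real markup may differ slightly):

import re

# Hypothetical serialized tag -- note there is no space before "/>"
team_home = '<div class="col-md-5 center"><img src="crest.png" title="Team A"/></div>'

print(re.search('title="(.*)" />', team_home))          # None, so .group(1) would raise AttributeError
print(re.search('title="(.*)"', team_home).group(1))    # Team A
print(re.search('title="(.*)"/>', team_home).group(1))  # Team A

With the space removed, or the /> dropped entirely, the search returns a match object and group(1) yields the team name.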