我有一个 Python 抓取脚本。我只是这个脚本的使用者(不是作者),运行时报错了,有人可以帮我看看吗?
#!/usr/bin/python3.6
"""Timeform race results parser."""
import csv
import logging
import os
import sys
import traceback
from datetime import date, timedelta, datetime
import re
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))
import requests
from bs4 import BeautifulSoup
import pandas as pd
from openpyxl import load_workbook
from config import TIMEFORM_HEADERS, TIMEFORM_PAYLOAD
class TimeForm:
    """Scrape Timeform race results and merge them with local CSV data.

    Workflow (see ``run_timeform``): ask the user for a mode and date(s),
    find the CSV files under ``data/<date>/`` (or ``data/<date>/MERGED/``),
    log in to timeform.com, scrape every result page of that date, match the
    scraped runners against the CSV rows and write the merged rows into an
    Excel template under ``timeform/Excel_files_out/<date>/``.

    The module-level ``config`` import must provide ``TIMEFORM_HEADERS``
    (request headers) and ``TIMEFORM_PAYLOAD`` (login form fields); both are
    used by ``parse_date``.
    """

    # Columns of the "merged" input CSV files (the header is rewritten to
    # exactly these names before the file is parsed).
    _MERGED_CSV_COLUMNS = [
        "Time",
        "Horse",
        "Race",
        "Old",
        "Update 1",
        "New",
        "Update 2",
        "Diff",
    ]

    # Columns produced by ``parse_race`` for every runner.
    _TF_COLUMNS = [
        "---",
        "Pos",
        "TF Cse",
        "TF Horse",
        "TF Jockey",
        "TF Trainer",
        "TF Age",
        "Wgt",
        "TF Date",
        "TF Time",
        "Track Type",
        "Going",
        "Ran",
        "Hi Odds",
        "Lo Odds",
        "ISP",
        "BSP",
        "Place",
        "BSP POS",
        "Track Distance",
    ]

    # Output layout for merged mode: CSV columns followed by scraped columns.
    _MERGED_COLUMNS = _MERGED_CSV_COLUMNS + _TF_COLUMNS

    # Output layout for racingpost mode.  "Ran" intentionally appears twice
    # (once here, once in _TF_COLUMNS) exactly as in the original layout.
    _RACINGPOST_COLUMNS = [
        "Time",
        "Course",
        "Age",
        "Distance",
        "Form",
        "Forc",
        "Horse",
        "Go",
        "Go %",
        "Dist",
        "Dist %",
        "Cse",
        "Cse %",
        "Trainer",
        "T 14dys",
        "T D W %",
        "T D £1 + -",
        "T O/A Seas",
        "T S W %",
        "T S £1 + -",
        "RTF %",
        "T 5yr O/A",
        "T Won",
        "Ran",
        "T W %",
        "Plcd",
        "T O/A Track",
        "T T W %",
        "T T £1 + -",
        "T F Won",
        "T F W %",
        "T F £1 + -",
        "Jockey",
        "J 14dys",
        "J D W %",
        "J D £1 + -",
        "J O/A Seas",
        "J S W %",
        "J S £1 + -",
        "J 5yr O/A",
        "J Won",
        "Rode",
        "J W %",
        "J O/A Track",
        "J T W %",
        "J T £1 + -",
        "J F Won",
        "J F W %",
        "J F £1 + -",
        "Mean",
    ] + _TF_COLUMNS

    # Logging is configured when the class is defined (i.e. at import time).
    # The log directory is created first because ``basicConfig`` opens the
    # file immediately and fails if the directory is missing.
    _LOG_DIR = os.path.join("data", "logs")
    os.makedirs(_LOG_DIR, exist_ok=True)
    logging.basicConfig(
        format="[%(asctime)s] %(levelname)s: %(message)s",
        level=logging.INFO,
        datefmt="%Y/%m/%dT%H:%M:%S",
        filename=os.path.join(
            _LOG_DIR,
            "{}Timeform.log".format(datetime.now().strftime("%Y-%m-%dT%H%M%S")),
        ),
    )
    # Mirror every log record to the console as well.
    logging.getLogger().addHandler(logging.StreamHandler())

    def __init__(self):
        """Create the HTTP session shared by the login and all page requests."""
        self.session = requests.Session()

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _file_stem(path):
        """Return the file name of *path* without directory and extension."""
        return os.path.splitext(os.path.basename(path))[0]

    @staticmethod
    def _tag_text(tag, strip=False):
        """Return ``tag.string`` as ``str``; "" if the tag or its string is missing."""
        if tag is None or tag.string is None:
            return ""
        text = str(tag.string)
        return text.strip() if strip else text

    @staticmethod
    def _format_weight(text):
        """Turn ``"9-7"`` into ``"9st-7pd"``; return "" if *text* has no ``-``."""
        parts = text.split("-")
        if len(parts) < 2:
            return ""
        return "{}st-{}pd".format(parts[0], parts[1])

    @staticmethod
    def _split_hi_lo(text):
        """Split an in-play ``"hi/lo"`` price string into a ``(hi, lo)`` pair.

        Either element is "" when it is absent, so a runner without in-play
        prices no longer aborts the parsing of the whole race.
        """
        parts = text.split("/")
        high = parts[0].strip()
        low = parts[1].strip() if len(parts) > 1 else ""
        return high, low

    @staticmethod
    def _assign_bsp_positions(results):
        """Normalise ``"BSP"`` and add a ``"BSP POS"`` rank to every runner dict.

        The dicts in *results* are modified in place.  If the first runner has
        no BSP (or there are no runners) every ``"BSP POS"`` is set to "".
        Otherwise each BSP is rewritten as ``str(float(...))`` (``"0"`` when it
        cannot be parsed) and ranked by ascending price, starting at ``"1"``.

        NOTE(review): unparsable prices are ranked as 0.0, so they occupy the
        first rank slots while their own "BSP POS" becomes "N/A" (their "0"
        never equals the "0.0" key).  Runners with equal prices share the
        highest of their ranks.  Both behaviours are kept from the original
        implementation - confirm whether they are intended.
        """
        if not results or results[0]["BSP"] == "":
            for entry in results:
                entry["BSP POS"] = ""
            return
        for entry in results:
            try:
                entry["BSP"] = str(float(entry["BSP"]))
            except (TypeError, ValueError):
                entry["BSP"] = "0"
        ordered = sorted(float(entry["BSP"]) for entry in results)
        rank_by_price = {
            str(price): str(rank) for rank, price in enumerate(ordered, start=1)
        }
        for entry in results:
            entry["BSP POS"] = rank_by_price.get(entry["BSP"], "N/A")

    @staticmethod
    def _append_blank_row(group):
        """Append an empty row (index -1) to *group*; used as a visual separator."""
        # Kept exactly as ``pd.Series([])``: the dtype of this empty series
        # influences how numeric columns are rendered after ``astype(str)``.
        group.loc[-1] = pd.Series([])
        return group

    @staticmethod
    def _match_by_jockey(timeforms, rows):
        """Merge CSV rows with scraped runners that share jockey and race time.

        The jockey name is the first ``,"..."`` group found in the CSV
        ``Jockey`` field.  CSV rows without such a group, or without a
        matching runner, are skipped.
        """
        matched = []
        for record in timeforms:
            names = re.findall(r',"(.*?)"', record.get("Jockey") or "")
            if not names:
                continue
            race_time = record.get("Time")
            hit = next(
                (
                    item
                    for item in rows
                    if item["TF Jockey"] == names[0] and item["TF Time"] == race_time
                ),
                None,
            )
            if hit:
                # Scraped values win when both sides define the same key.
                matched.append({**record, **hit})
        return matched

    @staticmethod
    def _match_by_horse(timeforms, rows):
        """Merge CSV rows with the first scraped runner whose name contains the horse.

        The comparison is case-insensitive.  CSV rows with an empty ``Horse``
        field, or without a matching runner, are skipped.
        """
        matched = []
        for record in timeforms:
            horse = record.get("Horse") or ""
            if not horse:
                continue
            needle = horse.lower()
            hit = next(
                (item for item in rows if needle in (item["TF Horse"] or "").lower()),
                None,
            )
            if hit:
                matched.append({**record, **hit})
        return matched

    # ------------------------------------------------------------------
    # Excel output
    # ------------------------------------------------------------------
    def excelize(self, dataframe, columns, date, answer, source_file=None):
        """Write *dataframe* into a copy of the Excel template for this mode.

        Args:
            dataframe: rows to write; only *columns* are kept, in that order.
            columns: column labels to select from *dataframe*.
            date: date string used as the output sub-directory name.
            answer: ``"r"`` for racingpost mode, anything else for merged mode.
            source_file: path of the CSV the rows came from.  In merged mode
                its base name (without extension) becomes part of the output
                file name; when omitted, *date* is used instead.

        The values are written to the ``Main`` sheet starting at the second
        row, first column, so the template's header row is preserved.  The
        workbook is saved under ``timeform/Excel_files_out/<date>/``.
        """
        dataframe = dataframe[columns]
        out_dir = os.path.join("timeform", "Excel_files_out", date)
        os.makedirs(out_dir, exist_ok=True)
        if answer == "r":
            template = os.path.join("assets", "racingpost_f.xlsx")
            target = os.path.join(out_dir, "racingpost_f.xlsx")
        else:
            template = os.path.join("assets", "output_results.xlsx")
            stem = self._file_stem(source_file) if source_file else date
            target = os.path.join(out_dir, "Results-{}.xlsx".format(stem))

        # Write through openpyxl directly instead of ``pd.ExcelWriter`` with
        # ``writer.book = ...`` / ``writer.save()``, which newer pandas
        # releases no longer support.
        book = load_workbook(template)
        sheet = book["Main"] if "Main" in book.sheetnames else book.create_sheet("Main")
        records = dataframe.itertuples(index=False, name=None)
        for row_number, record in enumerate(records, start=2):
            for column_number, value in enumerate(record, start=1):
                if isinstance(value, float) and value != value:
                    value = ""  # NaN -> empty cell text
                sheet.cell(row=row_number, column=column_number, value=value)
        book.save(target)

    # ------------------------------------------------------------------
    # Scraping
    # ------------------------------------------------------------------
    def _parse_runner(self, runner):
        """Extract the per-runner fields from one ``rp-table-row`` element.

        Every field is best effort: a missing element yields "".
        """
        entry = {
            "Pos": self._tag_text(runner.find("span", {"title": "Finishing Position"})),
            "TF Horse": self._tag_text(runner.find("a", {"class": "rp-horse"})),
        }
        # Prefer the decimal starting price, fall back to the fractional one.
        isp_tag = runner.find("span", {"class": "price-decimal"})
        if isp_tag is None:
            isp_tag = runner.find("span", {"class": "price-fractional"})
        entry["ISP"] = self._tag_text(isp_tag)
        entry["BSP"] = self._tag_text(
            runner.find("td", {"title": "Betfair Win SP"}), strip=True
        )
        entry["Place"] = self._tag_text(
            runner.find("td", {"title": "Betfair Place SP"}), strip=True
        )
        age_tag = runner.find("td", {"title": "Horse age"})
        entry["TF Age"] = self._tag_text(age_tag)
        # The weight is read from the cell that follows the age cell.
        weight_tag = age_tag.find_next("td") if age_tag is not None else None
        entry["Wgt"] = self._format_weight(self._tag_text(weight_tag))
        entry["TF Jockey"] = self._tag_text(runner.find("a", {"title": "Jockey"}))
        entry["TF Trainer"] = self._tag_text(runner.find("a", {"title": "Trainer"}))
        entry["Hi Odds"], entry["Lo Odds"] = self._split_hi_lo(
            self._tag_text(runner.find("td", {"title": "Betfair In-Play Prices"}))
        )
        return entry

    def parse_race(self, url):
        """Scrape one race result page and return a list of runner dicts.

        Args:
            url: site-relative URL of the result page (appended to
                ``https://www.timeform.com``).

        Returns:
            A list with one dict per runner (keys: the names in
            ``_TF_COLUMNS``).  The list is empty when nothing could be parsed,
            so callers can always iterate over the result.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            + "(KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
        }
        response = self.session.get(
            "https://www.timeform.com{}".format(url), headers=headers
        )
        page = BeautifulSoup(response.text, "html.parser")
        results = []
        try:
            runners = page.findAll("tbody", {"class": "rp-table-row"})
            # The heading text is "<time> <date...>": first token is the time.
            when = page.find("h2", {"title": "Date and time of race"}).string.split(" ")
            # Race-level fields are mandatory: if one is missing the page is
            # not a usable result page and the error is logged below.
            common = {
                "TF Cse": page.find("span", {"class": "rp-title-course-name"})
                .contents[0]
                .strip(),
                "---": "",
                "TF Date": " ".join(when[1:]),
                "TF Time": when[0],
                "Track Type": page.find("span", {"title": "The type of race"}).string,
                "Going": page.find("span", {"title": "Race going"}).string,
                "Ran": len(runners),
                "Track Distance": page.find(
                    "span", {"title": "Distance expressed in miles, furlongs and yards"}
                ).string,
            }
            for runner in runners:
                entry = self._parse_runner(runner)
                entry.update(common)
                results.append(entry)
            self._assign_bsp_positions(results)
        except Exception as e:  # keep scraping other races on any page error
            logging.exception("2) ERROR :: {}".format(e))
        if not results:
            logging.critical("No results for date found!")
        return results

    def _login(self):
        """Sign in to timeform.com using ``TIMEFORM_PAYLOAD`` on ``self.session``."""
        sign_in_page = self.session.get(
            "https://www.timeform.com/horse-racing/account/sign-in",
            headers=TIMEFORM_HEADERS,
        )
        # The value of the first hidden <input> on the sign-in page is sent
        # back as the request verification token.
        TIMEFORM_PAYLOAD["__RequestVerificationToken"] = BeautifulSoup(
            sign_in_page.text, "html.parser"
        ).find("input", {"type": "hidden"})["value"]
        # NOTE(review): the login response is not checked, as in the original.
        self.session.post(
            "https://www.timeform.com/horse-racing/account/"
            "handlelogin?returnUrl=%2Fhorse-racing%2F",
            data=TIMEFORM_PAYLOAD,
            headers=TIMEFORM_HEADERS,
        )

    def _scrape_results(self, date):
        """Return the runner dicts of every race listed on the results page of *date*."""
        listing = self.session.get(
            "https://www.timeform.com/horse-racing/results/{}".format(date),
            headers=TIMEFORM_HEADERS,
        )
        page = BeautifulSoup(listing.text, "html.parser")
        rows = []
        # The first "w-results-holder" block is skipped, as in the original.
        for meeting in page.findAll("div", {"class": "w-results-holder"})[1:]:
            title = meeting.find("h2", {"class": "results-header"}).string
            logging.info("\n\tParsing {}...\n\tDate: {}\n".format(title, date))
            for link in meeting.findAll("a", {"class": "results-title"}):
                if "/result/" in link["href"]:
                    rows.extend(self.parse_race(link["href"]))
        return rows

    def parse_date(self, date, answer, bpath):
        """Scrape all results of *date*, merge them with a CSV and write Excel.

        Args:
            date: date string (``YYYY-MM-DD``) naming the ``data/<date>``
                directory and the Timeform results page.
            answer: ``"r"`` for racingpost mode (input is
                ``data/<date>/racingpost.csv``, rows matched by jockey and
                time); anything else for merged mode (input is *bpath*, rows
                matched by horse name).
            bpath: path of the merged CSV file; ignored in racingpost mode.

        Raises:
            IndexError: if the input directory for *date* does not exist.
        """
        if answer == "r":
            basic_path = os.path.join("data", date)
        else:
            basic_path = os.path.join("data", date, "MERGED")
        if not os.path.exists(basic_path):
            logging.critical("No data for date!")
            raise IndexError("no data directory: {}".format(basic_path))

        self._login()
        rows = self._scrape_results(date)
        if not rows:
            logging.critical("1) No results/matches for date found!")

        if answer == "r":
            basicfile = os.path.join(basic_path, "racingpost.csv")
        else:
            basicfile = bpath
        df = pd.read_csv(basicfile)
        if answer != "r":
            # Normalise the header of merged files in place.  This must not be
            # done for racingpost.csv: the jockey matching below needs that
            # file's own "Jockey" column, which the rename would remove.
            df.columns = list(self._MERGED_CSV_COLUMNS)
            df.to_csv(basicfile, index=None)
        logging.info(df.head())

        with open(basicfile, "r", encoding="utf-8", newline="") as csvfile:
            timeforms = list(csv.DictReader(csvfile))

        if answer == "r":
            columns = list(self._RACINGPOST_COLUMNS)
            resulted = self._match_by_jockey(timeforms, rows)
        else:
            columns = list(self._MERGED_COLUMNS)
            resulted = self._match_by_horse(timeforms, rows)

        logging.info('\n\nresulted :: {}\n\n'.format(resulted))
        if not resulted:
            logging.critical("2) No results/matches for date found!")
            return
        # Sort by race time and add a blank separator row after each race.
        frame = (
            pd.DataFrame(resulted)
            .sort_values(["Time"], ascending=[True])
            .groupby(["Time"], as_index=False)
            .apply(self._append_blank_row)
            .fillna("")
            .astype(str)
        )
        self.excelize(frame, columns, date, answer, source_file=bpath)

    # ------------------------------------------------------------------
    # Interactive entry point
    # ------------------------------------------------------------------
    @staticmethod
    def _ask_choice(prompt, choices):
        """Prompt until the (lower-cased) answer is one of the two *choices*."""
        while True:
            reply = input(prompt).lower()
            if reply in choices:
                return reply
            logging.info('Please choose "{}" or "{}"!'.format(*choices))

    @staticmethod
    def _ask_date(text):
        """Prompt until the user enters a valid ``YYYY-MM-DD`` date; return it as typed."""
        while True:
            value = input(
                "Please Enter date {} in YYYY-MM-DD format:\n".format(text.upper())
            )
            try:
                datetime.strptime(value, "%Y-%m-%d")
            except ValueError:
                logging.info(
                    "Incorrect date format!\nPlease enter the format - YYYY-MM-DD"
                )
            else:
                return value

    def _process_files(self, idate, answer):
        """Run ``parse_date`` for every CSV file found for *idate*.

        Files are looked up in ``data/<idate>/`` (racingpost mode) or
        ``data/<idate>/MERGED/`` (merged mode).  A failure on one file is
        logged and does not stop the remaining files.
        """
        import glob  # not imported at module level

        subdir = "" if answer == "r" else "MERGED"
        all_files = glob.glob(os.path.join("data", idate, subdir, "*.csv"))
        if not all_files:
            logging.info("a) No results/matches for date found!")
            return
        out_dir = os.path.join("timeform", "Excel_files_out", idate)
        # NOTE(review): every file triggers a fresh login and a full re-scrape
        # of the date, and racingpost mode ignores the file path altogether.
        for csv_path in all_files:
            try:
                os.makedirs(out_dir, exist_ok=True)
                logging.info("Date: {}".format(idate))
                logging.info("File To Process: {}".format(self._file_stem(csv_path)))
                self.parse_date(idate, answer.lower(), str(csv_path))
            except Exception as e:  # one bad file must not stop the batch
                logging.exception("1) ERROR :: {}".format(e))

    def run_timeform(self):
        """Ask for mode and date(s) on the console and process the matching files."""
        try:
            answer = self._ask_choice(
                "Is it for (m)erged or (r)acingpost?\n", ("m", "r")
            )
            multiple_d = (
                self._ask_choice(
                    "Would you like to merge multiple dates? [y] or [n]\n", ("y", "n")
                )
                == "y"
            )
            if not multiple_d:
                self._process_files(self._ask_date(""), answer)
                return
            start = datetime.strptime(self._ask_date("FROM"), "%Y-%m-%d").date()
            end = datetime.strptime(self._ask_date("TO"), "%Y-%m-%d").date()
            one_day = timedelta(days=1)
            # NOTE(review): the TO date itself is not processed (``<``, as in
            # the original) - confirm whether the range should be inclusive.
            while start < end:
                day = start.isoformat()
                logging.info("\nParsing Date: {}\n".format(day).center(25, " "))
                self._process_files(day, answer)
                start += one_day
        except Exception:
            # Top-level boundary: show the error and keep the console open.
            traceback.print_exc()
            input("Press ENTER to quit...")
# tf = TimeForm()
# tf.run_timeform()
运行时出现以下错误:
Traceback (most recent call last):
File "C:/PycharmProjects/time/timeform.py", line 18, in <module>
from config import TIMEFORM_HEADERS, TIMEFORM_PAYLOAD
ImportError: cannot import name 'TIMEFORM_HEADERS'