rows.append 在循环中无法正常工作

时间:2016-02-15 19:31:44

标签: python html loops web-scraping beautifulsoup

下面的代码原本运行正常,直到我把“trainer”(训练师)字段加入抓取内容。该字段位于 HTML 中第二个兄弟节点的第二部分,即 line2;其他字段则来自源代码中的 line1。我本应得到 189 行数据,但加入提取 trainer 的代码后,每场比赛只得到最后一只狗(其余 5 只狗都丢失了),总共只有 18 行。出于某种原因,BeautifulSoup 在循环中没有按预期工作:加入 trainer 字段破坏了 rows.append。网址是 http://www.gbgb.org.uk/resultsMeeting.aspx?id=135754 ,代码如下:

import csv
from bs4 import BeautifulSoup
import requests


# Download one meeting's results page and parse it with the lxml parser.
html = requests.get("http://www.gbgb.org.uk/resultsMeeting.aspx?id=135754").text
soup = BeautifulSoup(html,'lxml')

# Collected output records; see the NOTE before rows.append for how many
# records this version actually produces.
rows = []
for header in soup.find_all("div", class_="resultsBlockHeader"):
    # Header-level fields: take the div's text, drop non-ASCII characters,
    # then trim "|" characters from both ends.
    # NOTE(review): calling .strip("|") on the result of .encode() only works
    # on Python 2; on Python 3 .encode() returns bytes and this raises
    # TypeError -- confirm the target interpreter version.
    track = header.find("div",    class_="track").get_text(strip=True).encode('ascii', 'ignore').strip("|")
    date = header.find("div",   class_="date").get_text(strip=True).encode('ascii', 'ignore').strip("|")
    datetime = header.find("div", class_="datetime").get_text(strip=True).encode('ascii', 'ignore').strip("|")
    grade = header.find("div", class_="grade").get_text(strip=True).encode('ascii', 'ignore').strip("|")
    distance = header.find("div", class_="distance").get_text(strip=True).encode('ascii', 'ignore').strip("|")
    prizes = header.find("div", class_="prizes").get_text(strip=True).encode('ascii', 'ignore').strip("|")

    # Every "ul.line1" inside the "resultsBlock" div that follows this header.
    results = header.find_next_sibling("div",  class_="resultsBlock").find_all("ul", class_="line1")
    for result in results:
        # Each iteration overwrites these names; nothing is stored per entry,
        # so after the loop they hold the values of the last "line1" only.
        fin = result.find("li", class_="fin").get_text(strip=True)
        greyhound = result.find("li", class_="greyhound").get_text(strip=True)
        trap = result.find("li", class_="trap").get_text(strip=True)
        sp = result.find("li", class_="sp").get_text(strip=True)
        timeSec = result.find("li", class_="timeSec").get_text(strip=True)
        timeDistance = result.find("li", class_="timeDistance").get_text(strip=True)


    # Every "ul.line2" of the same block; `results` is rebound here.
    results = header.find_next_sibling("div",  class_="resultsBlock").find_all("ul", class_="line2")
    for result in results:
         # Likewise overwritten on each iteration; only the last trainer survives.
         trainer = result.find("li",  class_="trainer").get_text(strip=True)



    # NOTE: this append is indented at the header-loop level, i.e. it runs
    # after both inner loops have finished. It therefore executes once per
    # header and records only the values left behind by the final iteration
    # of each inner loop, rather than one record per "line1" entry.
    rows.append({
            "track": track,
            "date": date,
            "greyhound": greyhound,
            "datetime":datetime,
            "sp" :sp,
            "grade":grade,
            "distance":distance,
            "prizes":prizes,
            "timeSec":timeSec,
            "timeDistance":timeDistance,
            "trap":trap,
            "fin":fin,
            "trainer":trainer

        })




# Write the collected records to greyfile.csv, one CSV line per dict, using
# the column order below. No header line is emitted.
fieldnames = [
    "track", "date", "trap", "fin", "greyhound", "datetime", "sp",
    "grade", "distance", "prizes", "timeSec", "timeDistance", "trainer",
]
with open("greyfile.csv", "w") as csv_file:
    csv.DictWriter(csv_file, fieldnames).writerows(rows)

1 个答案:

答案 0 :(得分:1)

我的猜测是:你之前是把 rows.append 放在第二个 for 循环(内层循环)里面的。下面的代码在同时存在两个内层循环的情况下复现了那种行为。

import csv
from bs4 import BeautifulSoup
import requests


def _header_text(header, css_class):
    """Return the ASCII-only text of ``header``'s ``div.<css_class>`` child.

    Non-ASCII characters are dropped and ``|`` characters are trimmed from
    both ends of the text.
    """
    text = header.find("div", class_=css_class).get_text(strip=True)
    # Decode back to text before stripping: on Python 3, calling
    # ``.strip("|")`` directly on the encoded bytes raises TypeError.
    return text.encode('ascii', 'ignore').decode('ascii').strip("|")


# Fields read once per race from the "resultsBlockHeader" div. Each name is
# both the CSS class looked up and the key stored in the output record.
HEADER_FIELDS = ("track", "date", "datetime", "grade", "distance", "prizes")
# Fields read per entry from each "ul.line1" list (CSS class == output key).
LINE1_FIELDS = ("fin", "greyhound", "trap", "sp", "timeSec", "timeDistance")

# Download one meeting's results page and parse it with the lxml parser.
html = requests.get("http://www.gbgb.org.uk/resultsMeeting.aspx?id=135754").text
soup = BeautifulSoup(html,'lxml')

# One record per "line1" entry, each carrying its race's header fields.
rows = []
for header in soup.find_all("div", class_="resultsBlockHeader"):
    # Includes "distance", which was scraped but never copied into the
    # records before, leaving that CSV column empty.
    race = {name: _header_text(header, name) for name in HEADER_FIELDS}

    # Look the results block up once and reuse it for both line1 and line2.
    block = header.find_next_sibling("div", class_="resultsBlock")

    details = [
        {
            name: line1.find("li", class_=name).get_text(strip=True)
            for name in LINE1_FIELDS
        }
        for line1 in block.find_all("ul", class_="line1")
    ]

    # line1 and line2 entries are paired by position. zip() stops at the
    # shorter sequence, so a surplus "line2" can no longer raise IndexError,
    # and an entry without a matching "line2" simply has no "trainer" key.
    for detail, line2 in zip(details, block.find_all("ul", class_="line2")):
        detail["trainer"] = line2.find("li", class_="trainer").get_text(strip=True)

    for detail in details:
        detail.update(race)
        rows.append(detail)

# Write the collected records to greyfile.csv, one CSV line per dict, using
# the column order below. No header line is emitted.
fieldnames = [
    "track", "date", "trap", "fin", "greyhound", "datetime", "sp",
    "grade", "distance", "prizes", "timeSec", "timeDistance", "trainer",
]
with open("greyfile.csv", "w") as csv_file:
    csv.DictWriter(csv_file, fieldnames).writerows(rows)