我正在尝试构建我的第一个 webscraper,所以请原谅我的代码一团糟。
但是我遇到了一个问题,我的程序卡在我的 while 循环中。
我想遍历总页数。所以我认为这应该可以通过 while 循环实现:
While huidigePagina (means current page in dutch) <= totalePagina:
...
...
huidigePagina = huidigePagina + 1
然而,即使在“+ 1”之后,我的 while 循环仍在继续。 或者当它工作时,我收到一个错误,我的汤.find 遇到了一个 NoneType ...... 我试过跳出 while 循环,但也没有用。
这是代码的全部内容:
links = []
GeoCodes = []
coordinaten = []
content = driver.page_source
soup = BeautifulSoup(content, features="html.parser")
for body in soup.findAll('div', {'class': 'span-20 last'}):
numberPages = body.find('div', {'id': 'ctl00_ContentBody_ResultsPanel'}).find('table', {'class': 'NoBottomSpacing'}).findAll('b')
print(numberPages)
huidigePagina = 1
totalePagina = int(numberPages[2].text)
while huidigePagina <= totalePagina:
for row in soup.findAll('tr', {"class":["SolidRow Data BorderTop", "BeginnerRow Data BorderTop", "AlternatingRow Data BorderTop"]}):
checkbox = row.find('td').input
if checkbox is not None:
cacheCodeLocator = row.find("td", {"class": "Merge"}).findNextSibling("td").text
#print(cacheCodeLocator)
if "GC" in cacheCodeLocator:
b = cacheCodeLocator
c = re.findall("GC.....", b)
GeoCodes.append(c[0])
cachePage = row.find("td", {"class": "Merge"}).findNextSibling("td").a['href']
if cachePage is not None:
driver.get(str(cachePage))
coordcontent = driver.page_source
coordinatenpagina = BeautifulSoup(coordcontent, features="html.parser")
coordinaat = coordinatenpagina.find('div', class_='span-9').p.span.strong.span.text
coordinaten.append(coordinaat)
driver.back()
links.append(cachePage)
#driver.get(str(Cachepage))
nextPage = len(numberPages) - 1
print(huidigePagina, totalePagina)
if huidigePagina != totalePagina:
driver.find_element_by_xpath("//*[@id='ctl00_ContentBody_ResultsPanel']/table[1]/tbody/tr/td[2]/a[{}]".format(nextPage)).click()
content2 = driver.page_source
soup = BeautifulSoup(content2, features="html.parser")
body2 = soup.find('div', {'class': 'span-20 last'})
numberPages = body2.find('div', {'id': 'ctl00_ContentBody_ResultsPanel'}).find('table', {'class': 'NoBottomSpacing'}).findAll('b')
print(numberPages)
huidigePagina = huidigePagina + 1
totalePagina = int(numberPages[2].text)
print(huidigePagina, totalePagina)
if huidigePagina == totalePagina:
break
df = pd.DataFrame({'GeoCodes': GeoCodes, 'links': links, 'coordinaten': coordinaten})
df.to_csv('C_And_C.csv', index=True, encoding='utf-8')
driver.close()
这里有一些似乎有效的更新代码?
links = []
GeoCodes = []
coordinaten = []
content = driver.page_source
soup = BeautifulSoup(content, features="html.parser")
for body in soup.findAll('div', {'class': 'span-20 last'}):
numberPages = body.find('div', {'id': 'ctl00_ContentBody_ResultsPanel'}).find('table', {'class': 'NoBottomSpacing'}).findAll('b')
print(numberPages)
huidigePagina = 1
totalePagina = int(numberPages[2].text)
while huidigePagina <= totalePagina:
for row in soup.findAll('tr', {"class":["SolidRow Data BorderTop", "BeginnerRow Data BorderTop", "AlternatingRow Data BorderTop"]}):
checkbox = row.find('td').input
if checkbox is not None:
cacheCodeLocator = row.find("td", {"class": "Merge"}).findNextSibling("td").text
#print(cacheCodeLocator)
if "GC" in cacheCodeLocator:
b = cacheCodeLocator
c = re.findall("GC.....", b)
GeoCodes.append(c[0])
cachePage = row.find("td", {"class": "Merge"}).findNextSibling("td").a['href']
if cachePage is not None:
driver.get(str(cachePage))
coordcontent = driver.page_source
coordinatenpagina = BeautifulSoup(coordcontent, features="html.parser")
coordinaat = coordinatenpagina.find('div', class_='span-9').p.span.strong.span.text
coordinaten.append(coordinaat)
driver.back()
links.append(cachePage)
#driver.get(str(Cachepage))
nextPage = len(numberPages) - 1
huidigePagina = huidigePagina + 1
if huidigePagina > totalePagina:
break
driver.find_element_by_xpath("//*[@id='ctl00_ContentBody_ResultsPanel']/table[1]/tbody/tr/td[2]/a[{}]".format(nextPage)).click()
content2 = driver.page_source
soup = BeautifulSoup(content2, features="html.parser")
body2 = soup.find('div', {'class': 'span-20 last'})
numberPages = body2.find('div', {'id': 'ctl00_ContentBody_ResultsPanel'}).find('table', {'class': 'NoBottomSpacing'}).findAll('b')
print(numberPages)
totalePagina = int(numberPages[2].text)
print(huidigePagina, totalePagina)
df = pd.DataFrame({'GeoCodes': GeoCodes, 'links': links, 'coordinaten': coordinaten})
df.to_csv('C_And_C.csv', index=True, encoding='utf-8')
driver.close()