实际上,该代码通常用于从网站上抓取数据,但问题在于,正在生成并保存在我的Excel工作表中的重复数据数量更多。
def extractor():
time.sleep(10)
souptree = html.fromstring(driver.page_source)
tburl = souptree.xpath("//table[contains(@id, 'theDataTable')]//tbody//tr//td[4]//a//@href")
for tbu in tburl:
allurl = []
allurl.append(urllib.parse.urljoin(siteurl, tbu))
for tb in allurl:
get_url = requests.get(tb)
get_soup = html.fromstring(get_url.content)
pattern = re.compile("^\s+|\s*,\s*|\s+$")
name = get_soup.xpath('//td[@headers="contactName"]//text()')
phone = get_soup.xpath('//td[@headers="contactPhone"]//text()')
mail = get_soup.xpath('//td[@headers="contactEmail"]//a//text()')
artitle = get_soup.xpath('//td[@headers="contactEmail"]//a//@href')
artit = ([x for x in pattern.split(str(artitle)) if x][-1])
title = artit[:-2]
for (nam, pho, mai) in zip(name, phone, mail):
fname = nam[9:]
allmails.append(mai)
allnames.append(fname)
allphone.append(pho)
alltitles.append(title)
fullfile = pd.DataFrame({'Names': allnames, 'Mails': allmails, 'Title': alltitles, 'Phone Numbers': allphone})
writer = ExcelWriter('G:\\Sheet_Name.xlsx')
fullfile.to_excel(writer, 'Sheet1', index=False)
writer.save()
print(fname, pho, mai, title, sep='\t')
while True:
time.sleep(10)
extractor()
try:
nextbutton()
except (WebDriverException):
driver.refresh()
except(NoSuchElementException):
time.sleep(10)
driver.quit()
我希望输出不应该重复,但是每次运行代码时,几乎一半以上的数据都在重复。