Question

我写了一个网页爬虫，用来从黄页（Yellow Pages）网站上抓取所有信息（仅用于学习目的）。

def _yp_scrape_business_details(detail_url):
    """Fetch one business detail page and extract its contact fields.

    Returns a 4-tuple of strings ``(website, email, phone, reviews)``.
    A field whose element is not present on the page is the string
    ``"None"``, which is what ends up in the spreadsheet cell.
    """
    # NOTE: the detail page is requested without REQUEST_HEADERS, exactly as
    # the listing loop did before this helper was extracted.
    response = requests.get(detail_url)
    detail_soup = BeautifulSoup(response.content, "html.parser")

    print("Acquiring Website")
    website_tag = detail_soup.find("a", class_="custom-link")
    website = "None" if website_tag is None else str(website_tag.get("href"))

    email_tag = detail_soup.find("a", class_="email-business")
    if email_tag is None:
        email = "None"
    else:
        print(email_tag.get("href"))
        # Strip the URI scheme so only the address itself is stored.
        email = str(email_tag.get("href")).replace("mailto:", "")

    # The phone number is read from the first <p> inside the "contact" div.
    contact_tag = detail_soup.find("div", class_="contact")
    phone_tag = None if contact_tag is None else contact_tag.find("p")
    if phone_tag is None:
        phone = "None"
    else:
        print(phone_tag.text)
        phone = str(phone_tag.text)

    reviews_tag = detail_soup.find(class_="count")
    if reviews_tag is None:
        reviews = "None"
    else:
        print(reviews_tag.text)
        reviews = str(reviews_tag.text)

    return website, email, phone, reviews


def actual_yellow_pages_scrape(link,no,dir,gui,sel,ypfind,terminal,user,password,port,type):
    """Scrape a Yellow Pages result listing, one ``.xls`` workbook per page.

    For every element with class ``business-name`` on the listing page a row
    is written with the columns: name, detail-page URL, website, e-mail,
    phone, review count.  Missing values are stored as the string ``"None"``.
    The workbook is saved as ``<dir>/<ypfind><no>.xls``; then the first
    ``a.next.ajax-page`` link is followed with ``no`` incremented, until a
    page has no such link.

    Args:
        link: URL of the first listing page to fetch.
        no: Page counter used as the numeric suffix of the output file name.
        dir: Directory the workbooks are written to.
        ypfind: Search term; used as the sheet name and file-name prefix.
        gui, sel, terminal, user, password, port, type: Not used by the
            scraping logic in this function; they are only echoed in the
            debug output (with the password masked).

    Returns:
        None.  Results are written to disk.
    """
    # Function-scope import keeps this block self-contained.
    import os

    base_url = "http://www.yellowpages.com"

    # Pagination is a loop rather than one recursive call per "next" anchor:
    # the stack does not grow with the page count, and a page that repeats
    # the "next" anchor does not cause later pages to be scraped again.
    while True:
        # Never echo the password in clear text.
        print(link,no,dir,gui,sel,ypfind,terminal,user,"***",port,type)

        r = requests.get(link,headers=REQUEST_HEADERS)
        soup = BeautifulSoup(r.content,"html.parser")
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet(str(ypfind))

        row = 0
        for entry in soup.find_all(class_="business-name"):
            sheet.write(row,0,str(entry.text))
            href = entry.get("href")
            if href is None:
                # No detail link to follow: keep the name and mark every
                # other column as missing instead of raising TypeError on
                # the string concatenation.
                details = ("None",) * 5
            else:
                detail_url = base_url + href
                details = (str(detail_url),) + _yp_scrape_business_details(detail_url)
            for column, value in enumerate(details, start=1):
                sheet.write(row,column,value)
            row += 1

        # os.path.join picks the right separator for the running platform.
        workbook.save(os.path.join(dir, str(ypfind) + str(no) + ".xls"))
        no += 1

        next_anchor = soup.find("a",class_="next ajax-page")
        next_href = None if next_anchor is None else next_anchor.get("href")
        if not next_href:
            break
        print(next_href)
        link = base_url + str(next_href)

上面的代码是爬虫中负责抓取的部分。我在 soup 那一行和 for 循环里都设置了断点，但 for 循环体中的代码一行都没有被执行，也没有抛出任何错误。我用同样的方式写了一个打印数字 1 到 10 的循环，它可以正常运行，但这个循环却不执行，为什么？

谢谢

Answer 1

已找到答案，

我用文本可视化工具查看了 `r.content` 中的内容，将其处理后得到了一份干净的 HTML，并浏览了这个 HTML 文件，最后发现页面提示浏览器不受支持。于是我删除了请求头（request headers）后重新运行代码，终于得到了我想要的结果。

为什么它会跳过整个for循环？

1 个答案: