My BeautifulSoup spider only crawls 2 pages, not all pages

Asked: 2017-06-19 15:39:08

Tags: python web-scraping beautifulsoup web-crawler

Any help would be appreciated, as I am new to Python. I created the web crawler below, but it only scrapes 2 pages rather than all of them. What changes does it need so that it crawls every page?

See the def trade_spider(max_pages) loop; at the bottom I call trade_spider(18), which should loop over all the pages.

Thanks for your help.

import csv
import re
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv','w', newline= "")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string 
            get_single_item_data(href) 
        page += 1
def get_single_item_data(item_url): 
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)

    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])

trade_spider(18)
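
For reference, a minimal sketch of an open-ended variant of the loop that keeps requesting result pages until one comes back empty, instead of relying on a hard-coded page count. It assumes every results page exposes the same 'listing-results-price text-price' anchors used above and that an out-of-range page number returns a page with no such links:

def trade_spider_open_ended():
    page = 1
    while True:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        links = soup.findAll('a', {'class': 'listing-results-price text-price'})
        if not links:
            # No listing links on this page: assume we are past the last page.
            break
        for link in links:
            href = "http://www.zoopla.co.uk" + link.get('href')
            get_single_item_data(href)
        page += 1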

1 Answer:

Answer 0 (score: 0)

Your code works fine and it does crawl all the pages (although there are only 14 result pages, not 18). It looks like you are trying to scrape street addresses; in that case the second function is unnecessary and only slows the crawler down by calling requests.get() multiple times. I have modified the code a little, and this version is faster.

import csv
import re
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv','w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/nottingham/?price_max=200000&identifier=nottingham&q=Nottingham&search_source=home&radius=0&pn=' + str(page) + '&page_size=100'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')

        # Changed the class value so the address links are selected directly

        for link in soup.findAll('a', {'class': 'listing-results-address'}):
            #href = "http://www.zoopla.co.uk" + link.get('href')
            #title = link.string 
            #get_single_item_data(href)
            address = link.get_text()
            print (address)               # Just to check it is working fine.
            writer.writerow([address])

        print (page)
        page += 1

# Unnecessary code

'''def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)

    for item_name in soup.findAll('h2', {'itemprop': 'streetAddress'}):
        address = item_name.get_text(strip=True)
        writer.writerow([address])'''

trade_spider(18)
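
As a further speed-up, the repeated requests.get() calls can share a single requests.Session, so the underlying connection to zoopla.co.uk is reused between page requests. A minimal sketch; the fetch() helper and the session handling are illustrative additions, not part of the answer above:

import requests

session = requests.Session()  # reuses the connection across requests

def fetch(url):
    # Single place for every HTTP call; raises an exception on 4xx/5xx responses.
    response = session.get(url)
    response.raise_for_status()
    return response.text

trade_spider() would then call fetch(url) instead of requests.get(url).text. It is also worth closing the CSV file once the crawl finishes (f.close(), or opening it with a with block) so the final rows are flushed to disk.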