I am trying to use a multiprocessing pool to crawl web pages in parallel and write the results to separate files. Since Pool.apply blocks until the function completes, the crawl effectively ran sequentially, so I switched to Pool.apply_async, hoping to process the different towns in parallel.
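To illustrate what I mean by the blocking behaviour, here is a toy sketch (not my crawler; slow_task and items are made-up names): apply waits for each call to finish before starting the next, while apply_async returns an AsyncResult immediately so all calls can run at once.

# Toy illustration only, unrelated to the crawler below.
from multiprocessing import Pool
import time

def slow_task(x):
    time.sleep(1)
    return x * x

if __name__ == '__main__':
    items = range(5)
    with Pool(5) as pool:
        # apply() blocks per call, so this loop takes roughly 5 seconds
        seq = [pool.apply(slow_task, (i,)) for i in items]
        # apply_async() returns handles immediately; all 5 tasks run in parallel (~1 second)
        handles = [pool.apply_async(slow_task, (i,)) for i in items]
        par = [h.get() for h in handles]
        print(seq, par)

My real code below follows the apply_async pattern: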
import csv
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
from multiprocessing import Pool
from os import path

class crawl_job(object):
    def __init__(self):
        self.website_url = 'https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12PSearch.jsp'
        self.url = 'https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12SSearch?function=searchMarketRental'
        self.town_list = ["AMK", "BD", "BH", "BB", "BM", "BP", "BT", "CT", "CCK", "CL", "GL", "HG", "JE", "JW", "KWN", "MP", "PRC", "PG", "QT", "SB", "SK", "SGN", "TAP", "TG", "TP", "WL", "YS"]
        self.result_url = 'https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12PResult.jsp'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'}
        # default form fields for the search POST; selTown and the block/date fields
        # are overwritten per request in crawl()
        self.data = {
            'rbnSearchType': '1',
            'selTown': "AMK",
            'txtBlkFrm': '100',
            'txtBlkTo': '110',
            'dteMthFrm1': "Jun+2018",
            'dteMthTo1': "May+2019",
            'txtAmtFrm1': '',
            'txtAmtTo1': '',
            'txtBoundary': '',
            'txtBlk': '',
            'dteMthFrm2': '',
            'dteMthTo2': '',
            'txtAmtFrm2': '',
            'txtAmtTo2': '',
            'hdTempRentalEnq': ''
        }
        self.params = {
            'function': 'searchMarketRental'
        }
        # one shared session: visit the search page first to pick up cookies
        self.s = requests.Session()
        self.s.headers.update(self.headers)
        self.s.get(self.website_url)
        self.result = []
    def crawl(self, town):
        # worker: query one town in blocks of 10 and return (output path, rows)
        result = []
        cur_date = datetime.now().strftime('%Y%m%d')
        filename = f"{town}_{cur_date}.csv"
        for i in range(0, 1010, 10):
            block_range = (i, i + 9)
            self.data.update({'selTown': town,
                              'txtBlkFrm': block_range[0],
                              'txtBlkTo': block_range[1],
                              'dteMthFrm1': datetime.strftime(datetime.now() - relativedelta(months=2), '%b %Y'),
                              'dteMthTo1': datetime.strftime(datetime.now(), '%b %Y')})
            content = self.s.post(self.url, data=self.data, timeout=20, allow_redirects=False)
            # a successful search redirects to the result page
            if content.status_code == 302:
                if self.result_url == content.headers.get('location'):
                    output = self.s.get(content.headers.get('location'))
                    soup = BeautifulSoup(output.text, 'html.parser')
                    try:
                        table = soup.find('div', {'id': 'result-non-paginated'})
                        table_body = table.find('tbody')
                        rows = table_body.find_all('tr')
                        for row in rows:
                            cols = row.find_all('td')
                            cols = [ele.text.strip() for ele in cols]
                            result.append([ele for ele in cols if ele])
                    except AttributeError as e:
                        pass
                    except requests.exceptions.RequestException as e:
                        with open('log.txt', 'a') as f:
                            f.write('{} Error when crawling {}: {}\n'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), town, block_range))
        return (path.join(path.dirname(__file__), 'hdb_rental', filename), result)
        # self.write_to_csv(path.join(path.dirname(__file__), 'hdb_rental', filename), result)
    def write_to_csv(self, x):
        # callback: runs in the main process with crawl()'s return value
        filename, result = x
        with open(filename, 'a') as f:
            writer = csv.writer(f)
            writer.writerows(result)
if __name__ == '__main__':
    job = crawl_job()
    p_list = []
    pool = Pool(5)
    for town in job.town_list:
        pool.apply_async(job.crawl, (town,), callback=job.write_to_csv)
    pool.close()
    pool.join()
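For clarity, this is the handoff I am relying on, sketched with made-up names (work/save, not my actual crawler): the value a worker returns from an apply_async task is passed to the callback, which runs in the main process.

# Toy sketch of the callback handoff, unrelated to the crawler above.
from multiprocessing import Pool

def work(town):
    # same shape as crawl()'s return value: (filename, rows)
    return (f"{town}.csv", [[town, 1], [town, 2]])

def save(result):
    filename, rows = result    # runs in the parent, like write_to_csv
    print(filename, rows)

if __name__ == '__main__':
    with Pool(2) as pool:
        for t in ['AMK', 'BD']:
            pool.apply_async(work, (t,), callback=save)
        pool.close()
        pool.join()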
Each file should contain only the results for its own town, but the results are mixed together. Here is one of the output files, for example AMK:
Jun 2019,Ang Mo Kio,235,Ang Mo Kio Avenue 3,4-Rm,2100.00
Jul 2019,Bukit Batok,241,Bukit Batok East Avenue 5,3-Rm,1500.00
Jul 2019,Bedok,182,Bedok North Road,4-Rm,2000.00