Writing to multiple files with multiprocessing

Time: 2019-07-02 04:25:17

Tags: python multiprocessing

I am trying to use a multiprocessing pool to crawl web pages in parallel and write the results of each town to a separate file.

Since Pool.apply blocks until the function completes, the crawl effectively ran sequentially, so I switched to Pool.apply_async, hoping to process the different towns in parallel.
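For reference, a minimal sketch of the difference between the two calls (the toy fetch function is mine, for illustration only, not part of the crawler):

from multiprocessing import Pool

def fetch(town):
    # stand-in for the real crawl: pretend to scrape one town
    return town.lower()

if __name__ == '__main__':
    with Pool(5) as pool:
        towns = ["AMK", "BD", "BH"]
        # apply blocks until each call returns, so this loop runs sequentially
        seq = [pool.apply(fetch, (t,)) for t in towns]
        # apply_async returns an AsyncResult immediately; tasks run concurrently
        handles = [pool.apply_async(fetch, (t,)) for t in towns]
        par = [h.get() for h in handles]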

import csv
from datetime import datetime
from multiprocessing import Pool
from os import path

import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta

class crawl_job(object):
    def __init__(self):
        self.website_url = 'https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12PSearch.jsp'
        self.url ='https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12SSearch?function=searchMarketRental'
        self.town_list = ["AMK","BD","BH","BB","BM","BP","BT","CT","CCK","CL","GL","HG","JE","JW","KWN","MP","PRC","PG","QT","SB","SK","SGN","TAP","TG","TP","WL","YS"]
        self.result_url = 'https://services2.hdb.gov.sg/webapp/BR12AWRentalEnquiry/BR12PResult.jsp'

        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'}
        self.data = {
            'rbnSearchType': '1',
            'selTown': "AMK",
            'txtBlkFrm': '100',
            'txtBlkTo': '110',
            'dteMthFrm1': "Jun+2018",
            'dteMthTo1': "May+2019",
            'txtAmtFrm1':'',
            'txtAmtTo1':'',
            'txtBoundary':'',
            'txtBlk':'',
            'dteMthFrm2':'',
            'dteMthTo2':'',
            'txtAmtFrm2':'',
            'txtAmtTo2':'',
            'hdTempRentalEnq':''
            }
        self.params = {
            'function':'searchMarketRental'
        }
        self.s = requests.Session()
        self.s.headers.update(self.headers)
        self.s.get(self.website_url)  # initial GET to obtain the session cookies
        self.result = []

    def crawl(self, town):
        result = []
        cur_date = datetime.now().strftime('%Y%m%d')
        filename = f"{town}_{cur_date}.csv"
        for i in range(0, 1010, 10):
            block_range = (i, i + 9)

            self.data.update({'selTown': town,
                        'txtBlkFrm': block_range[0],
                        'txtBlkTo': block_range[1],
                        'dteMthFrm1': datetime.strftime(datetime.now() - relativedelta(months=2), '%b %Y'),
                        'dteMthTo1': datetime.strftime(datetime.now(), '%b %Y')})
            try:
                content = self.s.post(self.url, data=self.data, timeout=20, allow_redirects=False)
                # a 302 redirect to the result page means the search succeeded
                if content.status_code == 302 and content.headers.get('location') == self.result_url:
                    output = self.s.get(content.headers.get('location'))
                    soup = BeautifulSoup(output.text, 'html.parser')
                    table = soup.find('div', {'id': 'result-non-paginated'})
                    table_body = table.find('tbody')
                    rows = table_body.find_all('tr')
                    for row in rows:
                        cols = [ele.text.strip() for ele in row.find_all('td')]
                        result.append([ele for ele in cols if ele])
            except AttributeError:
                # no result table on this page; skip the block range
                pass
            except requests.exceptions.RequestException:
                # log network failures and continue with the next block range
                with open('log.txt', 'a') as f:
                    f.write('{} Error when crawling {}: {}\n'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), town, block_range))

        return (path.join(path.dirname(__file__), 'hdb_rental', filename), result)
        # self.write_to_csv(path.join(path.dirname(__file__), 'hdb_rental', filename), result)

    def write_to_csv(self, x):
        # invoked as the apply_async callback, in the parent process
        filename, result = x
        with open(filename, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(result)

if __name__ == '__main__':
    job = crawl_job()
    pool = Pool(5)
    for town in job.town_list:
        # each task returns (filename, rows); the callback appends them to disk
        pool.apply_async(job.crawl, (town,), callback=job.write_to_csv)
    pool.close()
    pool.join()

Each file should contain only the results for its own town, but the results come out jumbled. Below is sample output from one file, for example AMK:

Jun 2019,Ang Mo Kio,235,Ang Mo Kio Avenue 3,4-Rm,2100.00
Jul 2019,Bukit Batok,241,Bukit Batok East Avenue 5,3-Rm,1500.00
Jul 2019,Bedok,182,Bedok North Road,4-Rm,2000.00
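To narrow down where the mixing happens, each returned row could be tagged with the town that was requested and the worker process that produced it. A minimal diagnostic sketch (tagged_crawl is a hypothetical stand-in for crawl, not the actual code):

from multiprocessing import Pool, current_process

def tagged_crawl(town):
    # stand-in for crawl_job.crawl: prepend the requested town and the
    # worker process name to every parsed row so stray rows can be traced
    rows = [["235", "Ang Mo Kio Avenue 3"]]  # pretend these came from the parser
    return [(town, current_process().name, *r) for r in rows]

if __name__ == '__main__':
    with Pool(5) as pool:
        handles = [pool.apply_async(tagged_crawl, (t,)) for t in ["AMK", "BD"]]
        for h in handles:
            for row in h.get():
                print(row)

If rows for another town show up tagged with selTown AMK, the mixing is already present in a single response (for example, because every worker inherits a copy of the same session cookies), rather than being introduced by the CSV callback.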
