我正试图加快抓取过程。
我已经在这里阅读了几个回答,但仍然无法让我的代码正常工作。
另外,向 all_results 追加元素时,我是否需要加锁?
最后,我希望线程只在结果 JSON 中的 “lastpage” 字段为 True 时才停止,而不是依赖预先给定的页码列表。这可以实现吗?
到目前为止的代码:
from io import StringIO
from queue import Queue
from threading import Thread, current_thread

import pandas as pd
import requests
# Endpoint that each worker sends its POST request to.
post_url = "https://www.nadlan.gov.il/Nadlan.REST/Main/GetAssestAndDeals"
# Shared accumulator: worker threads append every fetched result to this list.
all_results: list = []
def _build_post_request(page_no):
    """Return a fresh request payload for the Tel Aviv search at *page_no*."""
    return {
        "MoreAssestsType": 0, "FillterRoomNum": 0, "GridDisplayType": 0,
        "ResultLable": "תל אביב -יפו", "ResultType": 1, "ObjectID": "5000",
        "ObjectIDType": "number", "ObjectKey": "SETL_CODE", "DescLayerID": "SETL_MID_POINT",
        "Alert": None, "X": 180487.57, "Y": 665754.35, "Gush": None, "Parcel": None,
        "showLotParcel": False, "showLotAddress": False, "OriginalSearchString": "תל אביב",
        "MutipuleResults": False, "ResultsOptions": None, "CurrentLavel": 2,
        "Navs": [{"text": "מחוז תל אביב - יפו", "url": None, "order": 1}],
        "QueryMapParams": {"QueryToRun": None, "QueryObjectID": "5000",
                           "QueryObjectType": "number", "QueryObjectKey": "SETL_CODE",
                           "QueryDescLayerID": "KSHTANN_SETL_AREA", "SpacialWhereClause": None},
        "isHistorical": False, "PageNo": page_no, "OrderByFilled": None, "OrderByDescending": True,
        "Distance": 0,
    }


def do_work():
    """Worker loop: fetch pages taken from the queue and collect their results.

    Runs forever. Each iteration takes a page number from the module-level
    queue ``q``, POSTs the search payload for that page to ``post_url``,
    parses the response body with pandas and adds every entry of its
    ``AllResults`` column to the module-level ``all_results`` list.

    ``q.task_done()`` is called for every page, including pages that fail,
    so a ``q.join()`` in the main thread cannot block forever because of a
    failed request. Request and parsing errors are reported on stdout and
    the worker moves on to the next page.
    """
    while True:
        page_to_work = q.get()
        try:
            # NOTE(review): ``data=`` makes requests form-encode the dict, so
            # nested values such as "Navs" and "QueryMapParams" are not sent
            # as JSON. If the endpoint expects a JSON body this should be
            # ``json=`` instead -- confirm against the API.
            response = requests.post(
                post_url,
                data=_build_post_request(page_to_work),
                headers={'connection': 'close'},
                verify=True,
            )
            # Wrap the text in StringIO: passing a literal JSON string to
            # read_json is deprecated in recent pandas releases.
            df_json = pd.read_json(StringIO(response.text), orient="records")
            all_results.extend(df_json.AllResults)
            # Report the thread that actually did the work, not whichever
            # Thread object a module-level variable happens to hold.
            print("{} done page {}".format(current_thread().name, page_to_work))
        except (requests.RequestException, ValueError) as err:
            # Network failures and unparsable bodies must not kill the worker.
            print("{} failed page {}: {}".format(current_thread().name, page_to_work, err))
        finally:
            # Always balance q.get() so q.join() can return.
            q.task_done()
# Page numbers to request: 0 through 24.
pages = list(range(25))
# Number of worker threads started below.
concurrent = 20
# Bounded work queue; put() blocks once 2 * concurrent pages are waiting.
q = Queue(maxsize=concurrent * 2)

# Start the daemon workers; each one runs do_work and takes pages from q.
for _ in range(concurrent):
    t = Thread(target=do_work, daemon=True)
    t.start()

# Hand every page number to the workers.
for p in pages:
    q.put(p)

# Block until task_done() has been called once for every queued page.
q.join()