我正在编写代码来抓取一个列出日托中心的网站。代码目前只能获取100多页中第一页的相关信息,我不知道如何修改代码才能抓取第1页之后的页面。每个页面只列出25个日托中心,总共有150多页需要抓取。我读到过一些资料,说点击页码时会触发POST或GET请求(应该是一位名为“manhattan”的用户分享的解决方案),但我没能利用这些信息修复我的代码。以下是当前代码的副本,希望能得到一些帮助:
import requests, bs4, re
# Download the search-results page with every filter field left blank, then parse it.
cc = requests.get(
    'https://data.nj.gov/childcare_results?center=&phone=&capacity_low=&capacity_high=&county=&city=&addr1=&zip='
)
cc1 = bs4.BeautifulSoup(cc.text, "html.parser")

# Goal: collect only the <a href="..."> anchors that point at each daycare's
# inspection report.
# NOTE: 'div a' matches every anchor nested inside any <div>, so the result
# contains many more links than wanted; the selector still needs narrowing.
# NOTE: only the page fetched above is processed; the remaining 150+ result
# pages are not handled here.
cc2 = cc1.select('div a')

# First the number of matches, then the matches themselves, one per line.
print(len(cc2), cc2, sep="\n")
答案 0 :(得分:1)
除了直接下载数据集之外,这是自动执行整个过程最简单的方法之一。要获取该链接,您需要在开发者工具的 network tab 中仔细查看 xhr 部分。通过更改此网址(https://data.nj.gov/views/INLINE/rows.json?accessType=WEBSITE&method=getByIds&asHashes=true&start=0&length=25)末尾的数字 25,您可以一次获取您所设置数量的记录。试一试:
import requests
from bs4 import BeautifulSoup
# JSON body posted to the rows.json endpoint. It describes the "Licensed Child
# Care Centers" view: its column definitions, display metadata, and the query
# (all text filters empty, capacity between 0 and 9999999, ordered by center).
# Column id 346250858 is the INSPECTIONS column.
data = {
    "columns":[
        {"id":346250849,"name":"COUNTY","dataTypeName":"text","description":"County that Childcare Center is located in.","fieldName":"county","position":1,"tableColumnId":21926709,"width":86,"format":{},"metadata":{}},
        {"id":346250850,"name":"CENTER","dataTypeName":"text","description":"Childcare Center name.","fieldName":"center","position":2,"tableColumnId":21926710,"width":342,"format":{},"metadata":{}},
        {"id":346250851,"name":"PHONE","dataTypeName":"phone","description":"Telephone number of Childcare Center.","fieldName":"phone","position":3,"tableColumnId":21926711,"width":142,"format":{"align":"left"},"metadata":{}},
        {"id":346250852,"name":"AGES","dataTypeName":"text","description":"Ages of children that Childcare Center provides services to.","fieldName":"ages","position":4,"tableColumnId":21926712,"width":80,"format":{"align":"center"},"metadata":{}},
        {"id":346250853,"name":"CAPACITY","dataTypeName":"number","description":"Capacity of Childcare Center.","fieldName":"capacity","position":5,"tableColumnId":21926713,"width":80,"format":{"precisionStyle":"standard","noCommas":"false","align":"center"},"metadata":{}},
        {"id":346250854,"name":"CITY","dataTypeName":"text","description":"City that Childcare Center is located in.","fieldName":"city","position":6,"tableColumnId":21926714,"width":183,"format":{},"metadata":{}},
        {"id":346250855,"name":"ADDR1","dataTypeName":"text","description":"Street Address of Childcare Center.","fieldName":"addr1","position":7,"tableColumnId":21926715,"width":232,"format":{},"metadata":{}},
        {"id":346250856,"name":"ADDR2","dataTypeName":"text","description":"Street Address of Childcare Center.","fieldName":"addr2","position":8,"tableColumnId":21926716,"width":241,"format":{},"metadata":{}},
        {"id":346250857,"name":"ZIP","dataTypeName":"text","description":"Zip code of Childcare Center location.","fieldName":"zip","position":9,"tableColumnId":21926717,"width":73,"format":{"precisionStyle":"standard","noCommas":"false","align":"right"},"metadata":{}},
        {"id":346250858,"name":"INSPECTIONS","dataTypeName":"dataset_link","description":"Link to webpage with inspection reports for the center","fieldName":"inspections","position":10,"tableColumnId":31789576,"width":100,"format":{},"metadata":{}},
    ],
    "id":"cru5-4rmm",
    "name":"Licensed Child Care Centers",
    "attribution":"New Jersey Department of Children and Families",
    "category":"Public Safety",
    "description":"Searchable listing of all Licensed Child Care Centers and Inspection Reports.",
    "displayType":"table",
    "hideFromCatalog":False,
    "hideFromDataJson":False,
    "iconUrl":"fileId:_Judwh1-EhTD-ocxNjEi_f1JVO4iylkkrFqVjAbCZ6A",
    "publicationAppendEnabled":False,
    "metadata":{
        "rdfSubject":"0",
        "attachments":[{"filename":"Licensed_Child_Care_Centers_Explorer_Metadata_Description.docx","assetId":"VDUg-zFJ0y2Fsti8oTr4gkdMiqvg0a2JTk7Co7uxYRg","blobId":"","name":"Licensed_Child_Care_Centers_Explorer_Metadata_Description.docx"}],
        "custom_fields":{
            "Asset Details":{"Asset Posting Frequency":"Monthly","Geographic Coverage":"State","Asset Collection Frequency":"Monthly","Asset End Date":"Ongoing","Asset Provider Organization":"Children & Families","Asset Start Date":"1/15/15"},
            "Common Core":{"Contact Email":"ContactDataNJ@tech.nj.gov","Contact Name":"Data NJ Team","Publisher":"NJ OIT Open Data Center"},
        },
        "availableDisplayTypes":["table","fatrow","page"],
        "rowLabel":"Row",
        "renderTypeConfig":{"visible":{"table":True}},
        "jsonQuery":{
            "where":{"operator":"AND","children":[
                {"operator":"CONTAINS","columnFieldName":"center","value":""},
                {"operator":"CONTAINS","columnFieldName":"phone","value":""},
                {"operator":"GREATER_THAN_OR_EQUALS","columnFieldName":"capacity","value":"0"},
                {"operator":"LESS_THAN_OR_EQUALS","columnFieldName":"capacity","value":"9999999"},
                {"operator":"CONTAINS","columnFieldName":"county","value":""},
                {"operator":"CONTAINS","columnFieldName":"city","value":""},
                {"operator":"CONTAINS","columnFieldName":"addr1","value":""},
                {"operator":"CONTAINS","columnFieldName":"zip","value":""},
                {"operator":"CONTAINS","columnFieldName":"zip","value":""},
            ]},
            "order":[{"columnFieldName":"center","ascending":True}],
        },
    },
    "query":{
        "filterCondition":{"value":"AND","children":[
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"center"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"phone"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"GREATER_THAN_OR_EQUALS","children":[{"type":"column","columnFieldName":"capacity"},{"value":"0","type":"literal"}],"type":"operator"},
            {"value":"LESS_THAN_OR_EQUALS","children":[{"type":"column","columnFieldName":"capacity"},{"value":"9999999","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"county"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"city"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"addr1"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"zip"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"zip"},{"value":"","type":"literal"}],"type":"operator"},
        ],"type":"operator"},
        "orderBys":[{"expression":{"type":"column","columnId":346250850},"ascending":True}],
    },
    "tags":["ool","child care","child care centers","dcf"],
    "flags":["default","restorable"],
    "originalViewId":"cru5-4rmm",
    "displayFormat":{},
}
# Fetch the listing one page at a time by advancing the `start` offset, and
# print the inspection-report link found in each row.
# NOTE(review): assumes the endpoint returns an empty or short list once
# `start` runs past the last row -- confirm against the live API.
page_size = 25
start = 0
while True:
    res = requests.post(
        "https://data.nj.gov/views/INLINE/rows.json"
        f"?accessType=WEBSITE&method=getByIds&asHashes=true&start={start}&length={page_size}",
        json=data,
    )
    res.raise_for_status()  # fail loudly instead of trying to parse an error page
    items = res.json()
    if not items:
        break
    for item in items:
        # '346250858' is the id of the INSPECTIONS column in the posted view
        # definition; the cell is parsed as HTML to reach its <a> tag.
        cell = item.get('346250858')
        if not cell:
            continue
        anchor = BeautifulSoup(cell, "lxml").find("a")
        if anchor is not None and anchor.has_attr("href"):
            print(anchor["href"])
    if len(items) < page_size:
        break  # a short page means there is nothing left to fetch
    start += len(items)
答案 1 :(得分:0)
如@KeyurPotdar所述,点击“Download entire Dataset”下载csv文件似乎是一个不错的选择。拿到csv文件后,就可以轻松提取链接。
>>> import csv
...
... with open('Licensed_Child_Care_Centers.csv', 'r') as f:
... reader = csv.DictReader(f)
... links = [row['INSPECTIONS'].split('"')[1] for row in reader]
...
>>> len(links)
3966
>>> for link in links[-10:]:
... print(link)
...
http://www.nj.gov/dcf/families/childcare/centers/15STE0001.shtml
http://www.nj.gov/dcf/families/childcare/centers/170800279.shtml
http://www.nj.gov/dcf/families/childcare/centers/170800294.shtml
http://www.nj.gov/dcf/families/childcare/centers/170800295.shtml
http://www.nj.gov/dcf/families/childcare/centers/170800297.shtml
http://www.nj.gov/dcf/families/childcare/centers/170800298.shtml
http://www.nj.gov/dcf/families/childcare/centers/170300016.shtml
http://www.nj.gov/dcf/families/childcare/centers/10REA0005.shtml
http://www.nj.gov/dcf/families/childcare/centers/170900195.shtml
http://www.nj.gov/dcf/families/childcare/centers/170900180.shtml