I want to extract information from the URL of each search result on a web page. Each result links to a page that holds details about the company at fixed positions: address, company name, unique tax ID, and so on.
When I run the code it fails to extract the information from the original page, and I also don't know how to follow each result's link, scrape the details there, then move on to the next item and do the same until the last result (I've added a sketch of what I mean after my code below).
Thanks for your help.
My initial code is:
from bs4 import BeautifulSoup
import datetime
from tinydb import TinyDB, Query
import urllib3
import xlsxwriter
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = 'https://www.universidadperu.com/empresas/actividades-de-medicos-y-odontologo-categoria.php?dist=1501'
total_added = 0
def make_soup(url):
    http = urllib3.PoolManager()
    r = http.request("GET", url)
    return BeautifulSoup(r.data, 'lxml')

def main(url):
    global total_added
    db = TinyDB("Universidaddb.json")
    while url:
        print("Web Page: ", url)
        soup = soup_process(url, db)
        # Look for the "next page" link; stop when there is none
        nextlink = soup.find('a', rel=">>>")
        url = None
        if nextlink:
            url = nextlink['href']
    print("Added ", total_added)
    make_excel(db)

def soup_process(url, db):
    global total_added
    soup = make_soup(url)
    results = soup.find_all("li")
    for result in results:
        try:
            rec = {
                'webpage': result.find('a').get('href'),  # find() is a method, not a subscript
                'descr': result.a.get_text()              # link text; ['href'] returns a plain string with no .text
            }
            Result = Query()
            s1 = db.search(Result.webpage == rec["webpage"])
            if not s1:
                total_added += 1
                print("Adding ... ", total_added)
                db.insert(rec)
        except (AttributeError, KeyError):
            pass
    return soup

def make_excel(db):
    Headlines = ["Webpage", "Description"]
    row = 0
    workbook = xlsxwriter.Workbook('Universidad.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.set_column(0, 0, 20)  # webpage
    worksheet.set_column(1, 1, 60)  # descr
    for col, title in enumerate(Headlines):
        worksheet.write(row, col, title)
    for item in db.all():
        row += 1
        # write_url(), not write(): write() does not accept a string= argument
        worksheet.write_url(row, 0, item['webpage'], string='Web Page')
        worksheet.write(row, 1, item['descr'])
    workbook.close()

main(url)
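
Here is a minimal sketch of the per-result step I have in mind, reusing make_soup from above. The selectors are only guesses (I haven't confirmed the detail pages' actual HTML), so the h1 lookup, the strong-tag label scan, and the scrape_detail name are all hypothetical:

from urllib.parse import urljoin

def scrape_detail(base_url, rec, db):
    # Result links may be relative, so resolve against the listing page's URL
    detail_url = urljoin(base_url, rec['webpage'])
    soup = make_soup(detail_url)
    # Hypothetical layout: company name in the page's <h1>, other fields
    # (address, tax ID, ...) as "<strong>Label:</strong> value" pairs
    name_tag = soup.find('h1')
    if name_tag:
        rec['company'] = name_tag.get_text(strip=True)
    for strong in soup.find_all('strong'):
        label = strong.get_text(strip=True).rstrip(':')
        value = strong.next_sibling  # the text node right after the label
        if label and isinstance(value, str) and value.strip():
            rec[label.lower()] = value.strip()
    db.insert(rec)

In soup_process I would then replace db.insert(rec) with scrape_detail(url, rec, db), so every new result's detail page is visited before moving on to the next one and the extra fields end up in TinyDB alongside webpage and descr. Is that the right approach, and how do I find the correct selectors for each field?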