BeautifulSoup code to scrape all results and their content pages

Date: 2018-08-07 23:39:29

Tags: python-3.x web-scraping beautifulsoup

I want to extract information from the URL of each search result on a web page. The page lists companies, and each result links to a detail page with information about that company, such as its address, company name, and unique tax ID.

When I run the code it fails to extract the information from the listing page. I also don't know how to open each result's link, scrape the details there, and then move on to the next item, repeating until the last result (a sketch of that second step follows the code below).

Thanks for your help.

My initial code is:

from bs4 import BeautifulSoup
import datetime
from tinydb import TinyDB, Query
import urllib3
import xlsxwriter
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://www.universidadperu.com/empresas/actividades-de-medicos-y-odontologo-categoria.php?dist=1501'
total_added = 0

def make_soup(url):
    # Fetch the page and parse the returned HTML with the lxml parser.
    http = urllib3.PoolManager()
    r = http.request("GET", url)
    return BeautifulSoup(r.data, 'lxml')

def main(url):
    global total_added
    db = TinyDB("Universidaddb.json")

    while url:
        print ("Web Page: ", url)
        soup = soup_process(url, db)
        # Follow the "next page" link until there is none.
        nextlink = soup.find('a', rel=">>>")

        url = False
        if nextlink:
            url = nextlink['href']

    print ("Added ",total_added)

    make_excel(db)

def soup_process(url, db):
    global total_added

    soup = make_soup(url)
    # Every result on the listing page is an <li> containing a link.
    results = soup.find_all("li")
    for result in results:
        try:
            link = result.find('a')
            rec = {
                'webpage': link.get('href'),  # URL of the result's detail page
                'descr': link.text            # the link text (the company name)
            }

            Result = Query()
            s1 = db.search(Result.webpage == rec["webpage"])

            if not s1:
                total_added += 1
                print ("Adding ... ", total_added)
                db.insert(rec)

        except (AttributeError, KeyError):
            # Skip list items that have no link.
            pass

    return soup

def make_excel(db):
    Headlines = ["Webpage", "Description"]
    row = 0

    workbook = xlsxwriter.Workbook('Universidad.xlsx')
    worksheet = workbook.add_worksheet()

    worksheet.set_column(0,0, 20) # webpage
    worksheet.set_column(1,1, 60) # Descr

    for col, title in enumerate(Headlines):
        worksheet.write(row, col, title)

    for item in db.all():
        row += 1
        worksheet.write_url(row, 0, item['webpage'], string='Web Page')
        worksheet.write(row, 1, item['descr'] )

    workbook.close()

main(url)
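
For the second step (visiting each result's page) a minimal sketch of the idea follows. It reuses the make_soup helper above; the scrape_detail function, and its assumption that every field on a detail page is rendered as a <strong>Label:</strong> value pair, are guesses about the site's markup rather than something verified against universidadperu.com, so inspect the real HTML and adjust the selectors.

from urllib.parse import urljoin

def scrape_detail(href):
    # Fetch one company's detail page and return its labelled fields
    # (address, company name, tax ID, ...) as a dict.
    # urljoin handles both absolute and relative listing links.
    url = urljoin('https://www.universidadperu.com/empresas/', href)
    soup = make_soup(url)

    details = {}
    # ASSUMED layout: "<strong>Label:</strong> value" pairs on the page.
    for label in soup.find_all('strong'):
        key = label.get_text(strip=True).rstrip(':')
        value = label.next_sibling
        if isinstance(value, str) and value.strip():
            details[key] = value.strip()
    return details

Inside soup_process you could then enrich each record before inserting it, e.g. rec.update(scrape_detail(rec['webpage'])), and add matching columns in make_excel.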

0 Answers:

No answers yet.