Problems with my output when web scraping a website

Date: 2021-02-24 10:41:10

Tags: python python-3.x web-scraping beautifulsoup

I want to scrape all of the company names from every link on this page:

https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html

Each of those links contains several companies, for example:

https://www.bilansgratuits.fr/classement/6420Z/default.html

My goal is to get all of the companies from all of the links.

Here is my script so far:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Tokenizer used to strip punctuation from the scraped names
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')


def clean_text(text):
    # Tokenize on word characters and rejoin with single spaces
    return ' '.join(tokenizer.tokenize(text))

url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'

results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# Collect the link to every sector page from the directory
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

names = []

root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

for url in urls[:3]:

    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")

    try:
        # Company pages usually list their names inside the "donnees" block
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except AttributeError:
        # Fallback for pages that use the "listeEntreprises" layout instead
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

    names.append(name)


# Drop whitespace-only entries and split each name into words
rx = re.compile(r'^\s+$')
for i in range(0, 3):
    names[i] = [item.split() for item in names[i] if not rx.match(item)]


data = pd.DataFrame({
    'names': names
    })

data['names'] = data['names'].apply(str)
data['names'] = data['names'].apply(lambda x: clean_text(x))

print(data)

#data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')

I get this output:

[screenshot of the current output]

But that's not what I want; I would like one company name per row.

Like this:

[screenshot of the desired output]

And so on for all the names.
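In other words, I want the nested per-page lists flattened so the table holds one company name per row. A minimal sketch of the flattening I am after, using placeholder names since the real ones come from the scrape:

import pandas as pd

# Placeholder names standing in for the scraped results; the loop above
# produces one inner list of company names per sector page.
names = [
    ['BANQUE EXEMPLE', 'ASSURANCE EXEMPLE'],
    ['FINANCE EXEMPLE'],
]

# Flatten so each company name becomes its own row
flat = [name for page in names for name in page]

print(pd.DataFrame({'names': flat}))
#                names
# 0     BANQUE EXEMPLE
# 1  ASSURANCE EXEMPLE
# 2    FINANCE EXEMPLE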

2 Answers:

Answer 0 (score: 1)

Is this what you want?

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html"
html = requests.get(url).text

# Collect the link to every sector page from the directory
follow_urls = [
    f"https://www.bilansgratuits.fr{anchor['href']}" for anchor
    in BeautifulSoup(html, "html.parser").select(".titreElementAnnuaire a")
]

data = []
for follow_url in follow_urls:
    print(f"Fetching: {follow_url}")
    # The 6411Z page uses the directory layout, so it needs its own selector
    css_selector = ".titreElementAnnuaire a" if "6411Z" in follow_url else ".classementTop .blocRaisonSociale > a"
    company_urls = BeautifulSoup(
        requests.get(follow_url).text,
        "html.parser",
    ).select(css_selector)
    data.extend(
        [
            [
                " ".join(anchor.getText(strip=True).split()),
                f"https://www.bilansgratuits.fr{anchor['href']}",
            ] for anchor in company_urls
        ]
    )

pd.DataFrame(data).to_csv("your_data.csv", index=False, header=["Company", "URL"])
print("Done!")

Output: 345 entries in the .csv file:

[screenshot of the resulting .csv file]
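One fragile spot in this answer is the hard-coded "6411Z" check: any other sector page that happens to use the directory layout would be missed. A more defensive variant, my own sketch rather than part of the answer above, tries the ranking selector first and falls back to the directory selector when it matches nothing:

import requests
from bs4 import BeautifulSoup

def extract_company_anchors(page_url):
    # Sketch: try the ranking layout first, then fall back to the
    # directory layout used by pages like the 6411Z one.
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    anchors = soup.select(".classementTop .blocRaisonSociale > a")
    if not anchors:
        anchors = soup.select(".titreElementAnnuaire a")
    return anchors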

Answer 1 (score: 0)

Here is my final answer!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'

results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# Links and labels of every sector listed on the directory page
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
secteur_labels = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

secteurs = []
URLS = []
names = []

root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

# Matches entries that are nothing but whitespace
rx = re.compile(r'^\s+$')

for url, label in zip(urls[:3], secteur_labels[:3]):

    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")

    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except AttributeError:
        # Fallback for pages that use the "listeEntreprises" layout instead
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

    # Drop whitespace-only entries before counting, so the URL and
    # sector columns stay aligned with the company names
    name = [item for item in name if not rx.match(item)]

    URLS.extend([url] * len(name))
    secteurs.extend([label] * len(name))

    names.append(name)


# Flatten the per-page lists and normalize whitespace so each row
# holds exactly one company name
res = []
for sublist in names:
    for item in sublist:
        res.append(' '.join(item.split()))


data = pd.DataFrame({
    'names': res,
    'URL': URLS,
    'Secteur': secteurs
    })

data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')
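As a side note, the original script imported itertools without using it; the nested flattening loop above could equally be written with it. A one-liner that produces the same res list:

import itertools

# Flatten the per-page lists and normalize whitespace in one pass
res = [' '.join(item.split()) for item in itertools.chain.from_iterable(names)]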